Description

Create figures for the manuscript, extended abstract and poster.

packrat::init("/home/j.aguirreplans/Projects/Scipher/SampleSize/scripts/SampleSizeR")
## Initializing packrat project in directory:
## - "~/Projects/Scipher/SampleSize/scripts/SampleSizeR"
## Initialization complete!
library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(ggalluvial)
## Loading required package: ggplot2
library(gghalves)
library(ggplot2)
library(ggnewscale)
require(ggrepel)
## Loading required package: ggrepel
library(grid)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(gtable)
require(magrittr)
## Loading required package: magrittr
library(patchwork)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
## 
##     extract
## The following object is masked from 'package:igraph':
## 
##     crossing
set.seed(1510)
options(bitmapType='cairo')
`%ni%` <- Negate(`%in%`)

Load data

Un-normalized data:

input_dir = '/home/j.aguirreplans/Projects/Scipher/SampleSize/scripts/SampleSizeShiny/data'
plots_dir = '/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/plots'
tables_dir = '/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/tables'
numbers_file = paste(input_dir, 'dataset_numbers_complete_graph.txt', sep='/')
numbers_df = fread(numbers_file)
numbers_df$dataset = tolower(numbers_df$dataset)
meta_file = paste(input_dir, "metadata_summary.txt", sep="/")
meta_df = fread(meta_file)
sd_genes_file = paste(input_dir, "variation_by_gene.txt", sep="/") # from => calculate_rnaseq_datasets_variation.Rmd
sd_genes_df = fread(sd_genes_file)
name2color <- c(
  "TCGA: Lung cancer" = "#56B4E9",
  "tcga:tcga-luad" = "#56B4E9",
  "tcga:tcga-luad-tumor" = "#56B4E9", # #D55E00
  "TCGA: Breast cancer" = "#0072B2",
  "TCGA: Breast c." = "#0072B2",
  "tcga:tcga-brca_female" = "#0072B2",
  "tcga:tcga-brca-tumor_female" = "#0072B2",
  "breast neoplasms"="#0072B2", 
  "GTEx: Whole blood" = "#65F3BF",
  "gtex:whole.blood" = "#65F3BF",
  "GTEx: Lung" = "#24D157",
  "gtex:lung" = "#24D157",
  "GTEx: Breast" = "#00BA37", #44AA99
  "gtex:breast.mammary.tissue" = "#00BA37",
  "gtex:breast.mammary.tissue_female" = "#00BA37",
  "R. arthritis" = "#D55E00", #E69F00
  "scipher:scipher.sample.per.patient.baseline" = "#D55E00",
  "arthritis rheumatoid"="#D55E00", 
  "asthma"="#d9f365", 
  "thyroid neoplasms"="#0072B2",
  "tcga" = "#0072B2",
  "TCGA" = "#0072B2",
  "gtex" = "#00BA37",
  "GTEx" = "#00BA37",
  "scipher" = "#D55E00",
  #"Analytical model"="#e3f705",
  "Analytical model"="#cc68f7",
  "Power law"="#cc68f7",
  "Logarithm"="#09b863",
  "Exponential decay"="#09b863",
  "\u2265 0.2" = "#F8766D",
  "\u2265 0.4" = "#7CAE00",
  "\u2265 0.6" = "#00BFC4",
  "\u2265 0.8" = "#C77CFF",
  "Very Weak" = "#ff7b00",
  "Weak" = "#F8766D",
  "Moderate" = "#7CAE00",
  "Strong" = "#00BFC4",
  "Very Strong" = "#C77CFF",
  "Convergence point" = "red",
  "Model prediction" = "red",
  "P-value < 0.05" = "red",
  "common" = "#619CFF",
  "disease-gene" = "#cc68f7",
  "disease-specific" = "#F8766D",
  "normal-specific" = "#00BA38",
  "undefined" = "#808080",
  "different" = "#C77CFF",
  "all" = "#CD9600",
  "common (constant)" = "#619CFF",
  "common (change)" = "#b5d1ff",
  "disease-specific (constant)" = "#F8766D",
  "disease-specific (change)" = "#ffcbc7",
  "normal-specific (constant)" = "#00BA38",
  "normal-specific (change)" = "#a3d1b1",
  "undefined (constant)" = "#808080",
  "undefined (change)" = "#c2c2c2"
  )

input_dir = '/home/j.aguirreplans/Projects/Scipher/SampleSize/scripts/SampleSizeShiny/data/example_pearson_pval_0.05'
topology_results_file = paste(input_dir, 'topology_results_pearson_pval_0.05.txt', sep='/')
results_selected_df = fread(topology_results_file)
topology_results_by_size_file = paste(input_dir, 'topology_results_mean_pearson_pval_0.05.txt', sep='/')
topology_results_selected_by_size_df = fread(topology_results_by_size_file)
predictions_file = paste(input_dir, 'predictions_pearson_pval_0.05.txt', sep='/')
predicted_results_df = fread(predictions_file)
analytical_results_file = paste(input_dir, 'analytical_model_results_pearson_pval_0.05.txt', sep='/')
topology_results_selected_analytical_df = fread(analytical_results_file)
analytical_summary_file = paste(input_dir, 'analytical_model_summary_pearson_pval_0.05.txt', sep='/')
analytical_model_summary_df = fread(analytical_summary_file)
analytical_regression_results_file = paste(input_dir, 'analytical_model_regression_results_pearson_pval_0.05.txt', sep='/')
stretched_exponential_regression_df = fread(analytical_regression_results_file)
theoretical_sample_size_file = paste(input_dir, 'theoretical_sample_size_for_correlations_of_datasets.txt', sep='/') # from => calculate_convergence_correlation_types.Rmd
sample_size_correlation_df = fread(theoretical_sample_size_file)

Normalized data:

topology_type_normalization = "divide.L"
input_dir = '/home/j.aguirreplans/Projects/Scipher/SampleSize/scripts/SampleSizeShiny/data/example_pearson_pval_0.05'
topology_results_file = paste(input_dir, '/', 'topology_results_norm_', topology_type_normalization, '_pearson_pval_0.05.txt', sep='')
results_selected_norm_df = fread(topology_results_file)
topology_results_by_size_file = paste(input_dir, '/', 'topology_results_mean_norm_', topology_type_normalization, '_pearson_pval_0.05.txt', sep='')
topology_results_selected_by_size_norm_df = fread(topology_results_by_size_file)
predictions_file = paste(input_dir, '/', 'predictions_', topology_type_normalization, '_pearson_pval_0.05.txt', sep='')
predicted_results_norm_df = fread(predictions_file)
analytical_results_file = paste(input_dir, '/', 'analytical_model_results_', topology_type_normalization, '_pearson_pval_0.05.txt', sep='')
topology_results_selected_analytical_norm_df = fread(analytical_results_file)

Number of networks and maximum sample size:

nrow((results_selected_df %>% 
        dplyr::select(method, type_dataset, size, rep) %>% 
        unique()))
## [1] 2239
max(((results_selected_df %>% 
        dplyr::select(method, type_dataset, size, rep) %>% 
        unique()))$size)
## [1] 1080

Number of networks and maximum sample size without counting TCGA full dataset:

nrow((results_selected_df %>% 
        dplyr::select(method, type_dataset, size, rep) %>% 
        filter(!(type_dataset == "tcga:tcga")) %>% 
        unique()))
## [1] 2239
max(((results_selected_df %>% 
        dplyr::select(method, type_dataset, size, rep) %>% 
        filter(!(type_dataset == "tcga:tcga")) %>% 
        unique()))$size)
## [1] 1080

Make plots

Curve plot

All datasets:

#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca")
#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca", "scipher:scipher.complete.dataset")
#datasets_selected = c("gtex:breast.mammary.tissue", "gtex:thyroid", "tcga:tcga-brca", "tcga:tcga-thca", "scipher:scipher.complete.dataset")
#datasets_selected = c("gtex:breast.mammary.tissue", "gtex:thyroid", "gtex:whole.blood", "tcga:tcga-brca", "tcga:tcga-thca", "scipher:scipher.complete.dataset")
datasets_selected = c("gtex:breast.mammary.tissue_female", 
                      "gtex:lung", 
                      "gtex:whole.blood", 
                      "tcga:tcga-brca_female", 
                      "tcga:tcga-luad", 
                      "scipher:scipher.sample.per.patient.baseline")

results_curve_df = results_selected_norm_df %>%
  dplyr::filter(type_dataset %in% datasets_selected) %>%
  dplyr::select(size, rep, unnorm, type_dataset) %>%
  mutate(type_dataset = replace(type_dataset, 
                                type_dataset == "scipher:scipher.sample.per.patient.baseline", 
                                "R. arthritis")) %>%
  mutate(type_dataset = replace(type_dataset, 
                                type_dataset == "gtex:whole.blood", 
                                "GTEx: Whole blood")) %>%
  mutate(type_dataset = replace(type_dataset, 
                                type_dataset == "gtex:lung", 
                                "GTEx: Lung")) %>%
  mutate(type_dataset = replace(type_dataset, 
                                type_dataset == "gtex:breast.mammary.tissue_female", 
                                "GTEx: Breast")) %>%
  mutate(type_dataset = replace(type_dataset, 
                                type_dataset == "tcga:tcga-luad", 
                                "TCGA: Lung cancer")) %>%
  mutate(type_dataset = replace(type_dataset, 
                                type_dataset == "tcga:tcga-brca_female", 
                                "TCGA: Breast cancer")) %>%
  inner_join( (name2color %>% 
                 as.data.frame() %>% 
                 dplyr::rename("rgb_col"=".") %>% 
                 tibble::rownames_to_column("type_dataset")), 
              by=c("type_dataset") ) %>%
  mutate(geom_text_repel_label = 
           dplyr::if_else(type_dataset == "R. arthritis" & size == 240 & rep == 1, 
                          "R. arthritis", 
                          dplyr::if_else(type_dataset == "GTEx: Whole blood" & size == 700 & rep == 1, 
                                         "GTEx: Whole blood", 
                                         dplyr::if_else(type_dataset == "GTEx: Lung" & size == 560 & rep == 1, "GTEx: Lung", 
                                                        dplyr::if_else(type_dataset == "GTEx: Breast" & size == 140 & rep == 1, 
                                                                       "GTEx: Breast", 
                                                                       dplyr::if_else(type_dataset == "TCGA: Lung cancer" & size == 460 & rep == 1, 
                                                                                      "TCGA: Lung cancer", 
                                                                                      dplyr::if_else(type_dataset == "TCGA: Breast cancer" & size == 600 & rep == 1, 
                                                                                                                          "TCGA: Breast cancer", "")))))))

# # Plot with legend
# curve_plot = results_curve_df %>%
#   ggplot(aes(x=size, y=unnorm, col=type_dataset)) +
#   geom_point(alpha=0.5, size=3) +
#   scale_color_manual(values = c(name2color["R. arthritis"][[1]], name2color["GTEx: Whole blood"][[1]], name2color["GTEx: Lung"][[1]], name2color["GTEx: Breast"][[1]], name2color["TCGA: Lung cancer"][[1]], name2color["TCGA: Breast cancer"][[1]])) +
#   theme_linedraw() +
#   xlab("Num. samples") +
#   #xlab("log(N)") +
#   ylab("Num. significant correlations") +
#   guides(col=guide_legend(title="Dataset")) +
#   theme(aspect.ratio=1, 
#         plot.title =  element_text(size = 20, face="bold"), 
#         axis.title = element_text(size = 17, face="bold"), 
#         axis.text = element_text(size = 16), 
#         legend.text = element_text(size = 16), 
#         legend.title=element_text(size=16, face="bold"))

# Plot with labels
curve_plot = results_curve_df %>%
  ggplot(aes(x=size, y=unnorm, label = geom_text_repel_label)) +
  geom_point(aes(col=rgb_col), alpha=0.5, size=3) +
  scale_colour_identity() +
  theme_linedraw() +
  xlab("Num. samples") +
  ylab("Num. significant correlations") +
  theme(aspect.ratio=1, 
        plot.title =  element_text(size = 20, face="bold"), 
        axis.title = element_text(size = 17, face="bold"), 
        axis.text = element_text(size = 16), 
        panel.grid.major=element_blank(), 
        panel.grid.minor = element_blank(),
        text = element_text(family = "Helvetica"),
        legend.position="none"
        ) +
  #geom_text_repel(aes(col=rgb_col), box.padding = 0.5, max.overlaps = Inf) +
  geom_label_repel(aes(col=rgb_col),
                   fill="white",
                   box.padding = 0.5, max.overlaps = Inf,
                   label.size = 0.75, 
                   size = 5,
                   alpha = 0.9, 
                   label.padding=0.25, 
                   fontface = "bold",
                   na.rm=TRUE,
                   seed = 1234)

plot_file = paste(plots_dir, "curve_pearson_pval_0.05.png", sep="/")
ggsave(
  plot_file,
  plot=curve_plot,
  dpi = 1200,
  #width = 10000,
  height = 6000,
  units = c("px")
)
## Saving 8400 x 6000 px image
print(curve_plot)

Less datasets:

datasets_selected = c("gtex:breast.mammary.tissue_female", 
                      "tcga:tcga-brca_female")

results_curve_lessdatasets_df = results_selected_norm_df %>%
  dplyr::filter(type_dataset %in% datasets_selected) %>%
  dplyr::select(size, rep, unnorm, type_dataset) %>%
  mutate(type_dataset = 
           replace(type_dataset, 
                   type_dataset == "gtex:breast.mammary.tissue_female", 
                   "GTEx: Breast")) %>%
  mutate(type_dataset = 
           replace(type_dataset, 
                   type_dataset == "tcga:tcga-brca_female", 
                   "TCGA: Breast cancer")) %>%
  inner_join( (name2color %>% 
                 as.data.frame() %>% 
                 dplyr::rename("rgb_col"=".") %>% 
                 tibble::rownames_to_column("type_dataset")), 
              by = c("type_dataset") ) %>%
  mutate(geom_text_repel_label = 
           dplyr::if_else(type_dataset == "R. arthritis" & size == 240 & rep == 1, 
                          "R. arthritis", 
                          dplyr::if_else(type_dataset == "GTEx: Whole blood" & size == 700 & rep == 1, 
                                         "GTEx: Whole blood", 
                                         dplyr::if_else(type_dataset == "GTEx: Lung" & size == 560 & rep == 1, "GTEx: Lung", 
                                                        dplyr::if_else(type_dataset == "GTEx: Breast" & size == 140 & rep == 1, 
                                                                       "GTEx: Breast", 
                                                                       dplyr::if_else(type_dataset == "TCGA: Lung cancer" & size == 460 & rep == 1, 
                                                                                      "TCGA: Lung cancer", 
                                                                                      dplyr::if_else(type_dataset == "TCGA: Breast cancer" & size == 600 & rep == 1, 
                                                                                                                          "TCGA: Breast cancer", "")))))))

# Plot with labels
curve_plot_lessdatasets = results_curve_lessdatasets_df %>%
  ggplot(aes(x=size, y=unnorm, label = geom_text_repel_label)) +
  geom_point(aes(col=rgb_col), alpha=0.5, size=3) +
  scale_colour_identity() +
  theme_linedraw() +
  xlab("Num. samples") +
  ylab("Num. significant correlations") +
  theme(aspect.ratio=1, 
        plot.title =  element_text(size = 20, face="bold"), 
        axis.title = element_text(size = 17, face="bold"), 
        axis.text = element_text(size = 16), 
        panel.grid.major=element_blank(), 
        panel.grid.minor = element_blank(),
        text = element_text(family = "Helvetica"),
        legend.position="none"
        ) +
  #geom_text_repel(aes(col=rgb_col), box.padding = 0.5, max.overlaps = Inf) +
  geom_label_repel(aes(col=rgb_col),
                   fill="white",
                   box.padding = 0.5, max.overlaps = Inf,
                   label.size = 0.75, 
                   size = 5,
                   alpha = 0.9, 
                   label.padding=0.25, 
                   fontface = "bold",
                   na.rm=TRUE,
                   seed = 1234)

plot_file = paste(plots_dir, "curve_pearson_pval_0.05_lessdatasets.png", sep="/")
ggsave(
  plot_file,
  plot=curve_plot_lessdatasets,
  dpi = 1200,
  #width = 10000,
  height = 6000,
  units = c("px")
)
## Saving 8400 x 6000 px image
print(curve_plot_lessdatasets)

Scaling relation plot

Calculate scaling relation plot:

datasets_selected = c("gtex:breast.mammary.tissue_female", 
                      "gtex:lung", 
                      "gtex:whole.blood", 
                      "tcga:tcga-brca_female", 
                      "tcga:tcga-luad", 
                      "scipher:scipher.sample.per.patient.baseline")

# Calculate data from scaling relationship
cols = c("size", 
         "S", 
         "type_dataset", 
         "S_lag1", 
         "Fn", 
         "slope", 
         "intercept", 
         "adj.r.squared")

scaling_relation_df = data.frame(matrix(ncol = length(cols), 
                                        nrow = 0, 
                                        dimnames = list(NULL, cols)))

for (dataset_selected in datasets_selected){
  scaling_relation_filt = topology_results_selected_by_size_df %>% 
    filter(type_dataset == dataset_selected) %>%
    select(size, mean, type_dataset) %>%
    unique() %>%
    rename("S"="mean") %>%
    arrange(size) %>%
    mutate(S_lag1 = if_else(size == min(size), 0, lag(S))) %>%
    mutate(Fn = ((S - S_lag1)/S_lag1)) %>%
    filter((!(is.infinite(Fn))) & (Fn > 0))
  lm_summary = summary(lm(log(scaling_relation_filt$Fn) ~ log(scaling_relation_filt$size)))
  slope = coef(lm_summary)[2]
  intercept = coef(lm_summary)[1]
  adj.r.squared = lm_summary$adj.r.squared
  scaling_relation_df = rbind(scaling_relation_df, 
                              cbind(scaling_relation_filt, 
                                    data.frame(slope = slope, 
                                               intercept = intercept, 
                                               adj.r.squared = adj.r.squared)))
}

# Process results
scaling_relation_df = scaling_relation_df %>%
  filter(Fn > 0) %>%
  mutate_at(c('adj.r.squared'), ~sprintf("%.2f",.)) %>%
  mutate_at(c('adj.r.squared'), as.character) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.sample.per.patient.baseline", "R. arthritis")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:whole.blood", "GTEx: Whole blood")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:lung", "GTEx: Lung")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:breast.mammary.tissue_female", "GTEx: Breast")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-luad", "TCGA: Lung cancer")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca_female", "TCGA: Breast cancer")) %>%
  inner_join( (name2color %>% as.data.frame() %>% dplyr::rename("rgb_col"=".") %>% tibble::rownames_to_column("type_dataset")), by=c("type_dataset") ) %>%
  mutate(geom_text_repel_label=dplyr::if_else(type_dataset == "R. arthritis" & size == 240, "R. arthritis", dplyr::if_else(type_dataset == "GTEx: Whole blood" & size == 160, "GTEx: Whole blood", dplyr::if_else(type_dataset == "GTEx: Lung" & size == 160, "GTEx: Lung", dplyr::if_else(type_dataset == "GTEx: Breast" & size == 140, "GTEx: Breast", dplyr::if_else(type_dataset == "TCGA: Lung cancer" & size == 120, "TCGA: Lung cancer", dplyr::if_else(type_dataset == "TCGA: Breast cancer" & size == 600, "TCGA: Breast cancer", "")))))))

# Plot with legend
# scaling_relation_plot = scaling_relation_df %>%
#   ggplot(aes(x=log(size), y=log(Fn), col=adj.r.squared)) +
#   geom_point(alpha=0.5, size=3) +
#   geom_abline(aes(intercept=intercept, slope=slope, col=adj.r.squared), size=1) +
#   theme_linedraw() +
#   xlab("log(n)") +
#   ylab(bquote(bold(log((S[n]-S[n-1])/S[n-1])))) +
#   #scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#0072B2", "#56B4E9")) +
#   #scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#65F3BF", "#0072B2", "#56B4E9")) +
#   scale_color_manual(values = c("#56B4E9", "#E69F00", "#44AA99", "#65F3BF", "#0072B2", "#D55E00")) +
#   guides(col=guide_legend(title=bquote(bold(R^2)))) +
#   theme(aspect.ratio=1, 
#         plot.title =  element_text(size = 20, face="bold"), 
#         axis.title = element_text(size = 17, face="bold"), 
#         axis.text = element_text(size = 16), 
#         legend.position = c(0.895, 0.77),
#         legend.text = element_text(size = 11), 
#         legend.title=element_text(size=13, face="bold"),
#         legend.title.align=0.5,
#         legend.background = element_rect(size=0.2, linetype="solid", colour ="black"))

# Plot with labels
scaling_relation_plot = scaling_relation_df %>%
  ggplot(aes(x=log(size), y=log(Fn), col=rgb_col)) +
  geom_point(alpha=0.5, size=3) +
  geom_abline(aes(intercept=intercept, slope=slope, col=rgb_col), size=1) +
  theme_linedraw() +
  xlab("log(n)") +
  ylab(bquote(bold(log((S[n]-S[n-1])/S[n-1])))) +
  scale_colour_identity() +
  guides(col=guide_legend(title=bquote(bold(R^2)))) +
  theme(aspect.ratio=1, 
        plot.title =  element_text(size = 20, face="bold"), 
        axis.title = element_text(size = 17, face="bold"), 
        axis.text = element_text(size = 16), 
        panel.grid.major=element_blank(), 
        panel.grid.minor = element_blank(),
        text = element_text(family = "Helvetica"),
        legend.position="none") +
  geom_label_repel(aes(col=rgb_col, label=geom_text_repel_label),
                   fill="white",
                   box.padding = 0.5, max.overlaps = Inf,
                   label.size = 0.75, 
                   size = 5,
                   alpha = 0.6, 
                   label.padding=0.25, 
                   fontface = "bold",
                   na.rm=TRUE,
                   seed = 1234)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
plot_file = paste(plots_dir, "scaling_relation_pearson_pval_0.05.png", sep="/")
ggsave(
  plot_file,
  plot=scaling_relation_plot,
  dpi = 1200,
  #width = 9500,
  height = 6000,
  units = c("px")
)
## Saving 8400 x 6000 px image
print(scaling_relation_plot)

Less datasets:

datasets_selected = c("gtex:breast.mammary.tissue_female", 
                      "tcga:tcga-brca_female")

# Calculate data from scaling relationship
cols = c("size", 
         "S", 
         "type_dataset", 
         "S_lag1", 
         "Fn", 
         "slope", 
         "intercept", 
         "adj.r.squared")

scaling_relation_lessdatasets_df = data.frame(matrix(ncol = length(cols),
                                                     nrow = 0, 
                                                     dimnames = list(NULL, cols)))

for (dataset_selected in datasets_selected){
  scaling_relation_filt = topology_results_selected_by_size_df %>% 
    filter(type_dataset == dataset_selected) %>%
    select(size, mean, type_dataset) %>%
    unique() %>%
    rename("S" = "mean") %>%
    arrange(size) %>%
    mutate(S_lag1 = if_else(size == min(size), 0, lag(S))) %>%
    mutate(Fn = ((S - S_lag1) / S_lag1)) %>%
    filter((!(is.infinite(Fn))) & (Fn > 0))
  lm_summary = summary(lm(log(scaling_relation_filt$Fn)~log(scaling_relation_filt$size)))
  slope = coef(lm_summary)[2]
  intercept = coef(lm_summary)[1]
  adj.r.squared = lm_summary$adj.r.squared
  scaling_relation_lessdatasets_df = rbind(scaling_relation_lessdatasets_df, 
                                           cbind(scaling_relation_filt, 
                                                 data.frame(slope = slope, 
                                                            intercept = intercept, 
                                                            adj.r.squared = adj.r.squared)))
}

# Process results
scaling_relation_lessdatasets_df = scaling_relation_lessdatasets_df %>%
  filter(Fn > 0) %>%
  mutate_at(c('adj.r.squared'), ~sprintf("%.2f",.)) %>%
  mutate_at(c('adj.r.squared'), as.character) %>%
  mutate(type_dataset = replace(type_dataset, 
                                type_dataset == "tcga:tcga-brca_female", 
                                "TCGA: Breast cancer")) %>%
  mutate(type_dataset = replace(type_dataset, 
                                type_dataset == "gtex:breast.mammary.tissue_female", 
                                "GTEx: Breast")) %>%
  inner_join( (name2color %>% 
                 as.data.frame() %>% 
                 dplyr::rename("rgb_col"=".") %>% 
                 tibble::rownames_to_column("type_dataset")), 
              by=c("type_dataset") ) %>%
  #mutate(geom_text_repel_label=dplyr::if_else(type_dataset == "TCGA: Breast cancer" & size == 220, "Power law", ""))
  mutate(geom_text_repel_label = 
           dplyr::if_else(type_dataset == "GTEx: Breast" & size == 140, "GTEx: Breast", 
                          dplyr::if_else(type_dataset == "TCGA: Breast cancer" & size == 400, "TCGA: Breast cancer", "")))

scaling_relation_lessdatasets_df$r2labels = 
  ifelse(scaling_relation_lessdatasets_df$type_dataset == "GTEx: Breast", 
         paste("GTEx Breast: ", 
               unique((scaling_relation_lessdatasets_df %>% 
                         filter(type_dataset == "GTEx: Breast"))$adj.r.squared), 
               sep=""), 
         ifelse(scaling_relation_lessdatasets_df$type_dataset == "TCGA: Breast cancer", 
                paste("TCGA Breast c.: ", 
                      unique((scaling_relation_lessdatasets_df %>% 
                                filter(type_dataset == "TCGA: Breast cancer"))$adj.r.squared), 
                      sep=""), 
                ""))

# Plot with labels
scaling_relation_plot_lessdatasets = scaling_relation_lessdatasets_df %>%
  ggplot(aes(x=log(size), y=log(Fn), col=r2labels)) +
  geom_point(alpha=0.9, size=3) +
  #geom_abline(aes(intercept=intercept, slope=slope), col="#cc68f7", size=1) +
  geom_abline(aes(intercept=intercept, slope=slope, col=r2labels), size=1, key_glyph = "smooth") +
  theme_linedraw() +
  xlab("log(n)") +
  ylab(bquote(bold(log((S[n]-S[n-1])/S[n-1])))) +
  scale_color_manual(guide = "legend", values=as.vector(name2color[levels(factor(scaling_relation_lessdatasets_df$type_dataset))])) +
  #scale_colour_identity(guide = 'legend', labels=levels(factor(scaling_relation_lessdatasets_df$r2labels))) +
  guides(col = guide_legend(title=bquote(bold(.('Power law') ~ R^2)))) +
  theme(aspect.ratio = 1, 
        plot.title =  element_text(size = 20, face="bold"), 
        axis.title = element_text(size = 17, face="bold"), 
        axis.text = element_text(size = 16), 
        panel.grid.major=element_blank(), 
        panel.grid.minor = element_blank(),
        text = element_text(family = "Helvetica"),
        #legend.position="none") +
        legend.position = c(0.64, 0.86),
        legend.text = element_text(size = 16), 
        legend.title = element_text(size=16, face="bold")
        #legend.background = element_rect(linewidth=0.2, linetype="solid", colour ="black"))
        )
  #geom_label_repel(aes(label=geom_text_repel_label),
                   #col="#cc68f7",
  # geom_label_repel(aes(col=rgb_col, label=geom_text_repel_label),
  #                  fill="white",
  #                  box.padding = 0.5, max.overlaps = Inf,
  #                  label.size = 0.75, 
  #                  size = 5,
  #                  alpha = 1, 
  #                  label.padding=0.25, 
  #                  fontface = "bold",
  #                  na.rm=TRUE,
  #                  seed = 1234)
  
plot_file = paste(plots_dir, 
                  "scaling_relation_pearson_pval_0.05_lessdatasets.png", 
                  sep="/")
ggsave(
  plot_file,
  plot=scaling_relation_plot_lessdatasets,
  dpi = 1200,
  #width = 9500,
  height = 6000,
  units = c("px")
)
## Saving 8400 x 6000 px image
print(scaling_relation_plot_lessdatasets)

The plots of two datasets separated:

datasets_selected <- list("gtex:breast.mammary.tissue_female" = "GTEx: Breast",
                          "tcga:tcga-brca_female" = "TCGA: Breast cancer")

y_bot_limit <- -6
y_top_limit <- 3.5
x_bot_limit <- 3.5
x_top_limit <- 7.0
x_annotation <- 5.75
y_annotation <- 2.0

scaling_relation_breast_df <- scaling_relation_lessdatasets_df %>%
  filter(type_dataset == "GTEx: Breast")

scaling_relation_plot_breast <- scaling_relation_breast_df %>%
  ggplot(aes(x=log(size), y=log(Fn), col=rgb_col)) +
    geom_point(alpha=0.9, size=3) +
    geom_abline(aes(intercept=intercept, slope=slope, col=rgb_col), size=1) +
    annotate("text", 
             x = x_annotation, 
             y = y_annotation, 
             label = paste("GTEx:\nBreast\nR² =",
                     round(as.numeric(unique(scaling_relation_breast_df$adj.r.squared)), 
                           3)), 
             size = 5) +
    theme_linedraw() +
    xlab("log(n)") +
    ylab(bquote(bold(log((S[n]-S[n-1])/S[n-1])))) +
    xlim(x_bot_limit, x_top_limit) +
    ylim(y_bot_limit, y_top_limit) +
    scale_colour_identity(guide = 'legend', labels=levels(factor(scaling_relation_breast_df$r2labels))) +
    guides(col=guide_legend(title=bquote(bold(.('Power law') ~ R^2)))) +
    theme(aspect.ratio=2/1, 
          plot.title =  element_text(size = 20, face="bold"), 
          axis.title = element_text(size = 17, face="bold"), 
          axis.text = element_text(size = 16), 
          #axis.title.x = element_blank(),
          panel.grid.major=element_blank(), 
          panel.grid.minor = element_blank(),
          text = element_text(family = "Helvetica"),
          legend.position = "none")


scaling_relation_brcancer_df <- scaling_relation_lessdatasets_df %>%
  filter(type_dataset == "TCGA: Breast cancer")

scaling_relation_plot_brcancer <- scaling_relation_brcancer_df %>%
  ggplot(aes(x=log(size), y=log(Fn), col=rgb_col)) +
    geom_point(alpha=0.9, size=3) +
    geom_abline(aes(intercept=intercept, slope=slope, col=rgb_col), size=1) +
    annotate("text", 
             x = x_annotation, 
             y = y_annotation, 
             label = paste("TCGA:\nBreast c.\nR² =",
                     round(as.numeric(unique(scaling_relation_brcancer_df$adj.r.squared)), 
                           3)), 
             size = 5) +
    theme_linedraw() +
    xlab("log(n)") +
    ylab(bquote(bold(log((S[n]-S[n-1])/S[n-1])))) +
    xlim(x_bot_limit, x_top_limit) +
    ylim(y_bot_limit, y_top_limit) +
    scale_colour_identity(guide = 'legend', labels=levels(factor(scaling_relation_brcancer_df$r2labels))) +
    guides(col=guide_legend(title=bquote(bold(.('Power law') ~ R^2)))) +
    theme(aspect.ratio=2/1, 
          plot.title =  element_text(size = 20, face="bold"), 
          axis.title = element_text(size = 17, face="bold"), 
          axis.text = element_text(size = 16), 
          axis.text.y = element_blank(),
          axis.ticks.y = element_blank(),
          #axis.title.x = element_blank(),
          axis.title.y = element_blank(),
          panel.grid.major=element_blank(), 
          panel.grid.minor = element_blank(),
          text = element_text(family = "Helvetica"),
          legend.position = "none")
# layout <- "AB"
# 
# ((scaling_relation_plot_breast + 
#     theme(plot.margin = margin(0.6, 0.1, 0.6, 0.6, "cm"))) +
#   (scaling_relation_plot_brcancer + 
#      theme(plot.margin = margin(0.6, 0.6, 0.6, 0.1, "cm")))) +
#   xlab("log(n)") +
#   plot_layout(design = layout) &
#   theme(element_text(face = 'bold', size = 17))

Regression plot

#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca")
#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca", "scipher:scipher.complete.dataset")
#datasets_selected = c("gtex:breast.mammary.tissue", "gtex:thyroid", "tcga:tcga-brca", "tcga:tcga-thca", "scipher:scipher.complete.dataset")
#datasets_selected = c("gtex:breast.mammary.tissue", "gtex:thyroid", "gtex:whole.blood", "tcga:tcga-brca", "tcga:tcga-thca", "scipher:scipher.complete.dataset")
datasets_selected = c("gtex:breast.mammary.tissue_female", 
                      "gtex:lung", 
                      "gtex:whole.blood", 
                      "tcga:tcga-brca_female", 
                      "tcga:tcga-luad", 
                      "scipher:scipher.sample.per.patient.baseline")

#model_selected = "Stretched exponential (by optimization)"
model_selected = "Stretched exponential (by linear fit)"

# Process data
stretched_exponential_regression_proc_df = stretched_exponential_regression_df %>% inner_join((topology_results_selected_analytical_df %>% dplyr::select(type_dataset, model, slope, intercept, a, b) %>% unique()), by=c("type_dataset", "model")) %>%
  filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.sample.per.patient.baseline", "R. arthritis")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:whole.blood", "GTEx: Whole blood")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:lung", "GTEx: Lung")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:breast.mammary.tissue_female", "GTEx: Breast")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-luad", "TCGA: Lung cancer")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca_female", "TCGA: Breast cancer")) %>%
  inner_join( (name2color %>% as.data.frame() %>% dplyr::rename("rgb_col"=".") %>% tibble::rownames_to_column("type_dataset")), by=c("type_dataset") )

regression_plot = stretched_exponential_regression_proc_df %>%
  ggplot(aes(x=x, y=y, col=rgb_col)) +
  geom_point(alpha=0.5, size=3) +
  geom_abline(aes(intercept=intercept, slope=slope, col=rgb_col), size=1) +
  theme_linedraw() +
  xlab("log(n)") +
  ylab("log[ log(L) - log(S) ]") +
  #scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#0072B2", "#56B4E9")) +
  #scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#65F3BF", "#0072B2", "#56B4E9")) +
  #scale_color_manual(values = c("#56B4E9", "#E69F00", "#44AA99", "#65F3BF", "#0072B2", "#D55E00")) +
  scale_colour_identity(labels = unique(stretched_exponential_regression_proc_df$type_dataset), breaks = unique(stretched_exponential_regression_proc_df$rgb_col), guide = "legend") +
  guides(col=guide_legend(title="Dataset")) +
  theme(aspect.ratio=1,
        plot.title =  element_text(size = 20, face="bold"), 
        axis.title = element_text(size = 17, face="bold"), 
        axis.text = element_text(size = 16),
        panel.grid.major=element_blank(), 
        panel.grid.minor = element_blank(),
        legend.text = element_text(size = 16), 
        legend.title=element_text(size=16, face="bold"),
        text = element_text(family = "Helvetica"))

plot_file = paste(plots_dir, "regression_pearson_pval_0.05.png", sep="/")
ggsave(
  plot_file,
  plot=regression_plot,
  dpi = 1200,
  #width = 9500,
  height = 6000,
  units = c("px")
)
## Saving 8400 x 6000 px image
print(regression_plot)

Create a table with the results for the figure:

#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca")
#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca", "scipher:scipher.complete.dataset")
#datasets_selected = c("gtex:breast.mammary.tissue", "gtex:thyroid", "tcga:tcga-brca", "tcga:tcga-thca", "scipher:scipher.complete.dataset")
datasets_selected = c("gtex:breast.mammary.tissue_female", 
                      "gtex:lung", 
                      "gtex:whole.blood", 
                      "tcga:tcga-brca_female", 
                      "tcga:tcga-luad", 
                      "scipher:scipher.sample.per.patient.baseline")
#model_selected = "Stretched exponential (by optimization)"
model_selected = "Stretched exponential (by linear fit)"

# Show information for scaling relation
scaling_relation_summary_df = scaling_relation_df %>% 
  dplyr::select(type_dataset, adj.r.squared) %>% 
  unique() %>% 
  arrange(factor(type_dataset, levels = datasets_selected)) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.sample.per.patient.baseline", "R. arthritis")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:whole.blood", "GTEx: Whole blood")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:lung", "GTEx: Lung")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:breast.mammary.tissue_female", "GTEx: Breast")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-luad", "TCGA: Lung cancer")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca_female", "TCGA: Breast cancer")) %>%
  rename("R**2" = "adj.r.squared") %>%
  mutate_if(is.numeric, ~sprintf("%.2f",.))
print(scaling_relation_summary_df)
##           type_dataset R**2
## 1:        GTEx: Breast 0.94
## 2:          GTEx: Lung 0.65
## 3:   GTEx: Whole blood 0.70
## 4: TCGA: Breast cancer 0.69
## 5:   TCGA: Lung cancer 0.90
## 6:        R. arthritis 0.88
# Show all information for model selected
analytical_model_summary_df <- analytical_model_summary_df %>% 
  inner_join((results_selected_norm_df %>%
                select("type_dataset", "subclassification")), 
             by = c("type_dataset")) %>%
  unique()
analytical_model_summary_df$L_vs_total = abs((analytical_model_summary_df$unnorm_L - analytical_model_summary_df$total_num_edges)) / analytical_model_summary_df$total_num_edges
analytical_model_summary_df$dataset_name = analytical_model_summary_df$type_dataset
analytical_model_summary_df = 
  analytical_model_summary_df %>% 
  separate(col = "dataset_name", 
           into = c("dataset_name", "sex"), 
           sep = "_", 
           remove = TRUE) %>%
  separate(col = "dataset_name", 
           into = c("dataset", NULL), 
           sep = ":", 
           remove = FALSE) %>%
  mutate(sex = if_else(is.na(sex), "", sex))
## Warning: Expected 2 pieces. Additional pieces discarded in 24 rows [20, 21, 22, 42, 43,
## 44, 64, 65, 66, 86, 87, 88, 108, 109, 110, 130, 131, 132, 152, 153, ...].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 128 rows [1, 2, 3, 5, 6,
## 7, 8, 10, 11, 12, 13, 14, 16, 17, 18, 19, 23, 24, 25, 27, ...].
## Warning: Expected 1 pieces. Additional pieces discarded in 176 rows [1, 2, 3, 4, 5, 6,
## 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
analytical_model_summary_df$dataset_name = 
  ifelse(!(analytical_model_summary_df$subclassification == ""),
         paste(analytical_model_summary_df$dataset_name, 
               analytical_model_summary_df$subclassification, 
               sep = "-"), 
         analytical_model_summary_df$dataset_name)
analytical_model_summary_df$dataset_name = 
  ifelse(!(analytical_model_summary_df$sex == ""),
         paste(analytical_model_summary_df$dataset_name, 
               analytical_model_summary_df$sex, 
               sep = "_"), 
         analytical_model_summary_df$dataset_name)

analytical_model_summary_df %>% 
  dplyr::select(type_dataset, model, a, b, L, L_vs_total, adj.r.squared, relative.error.mean) %>% 
  filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>% unique() %>%
  arrange(factor(type_dataset, levels = datasets_selected, datasets_selected))
##                                  type_dataset
## 1           gtex:breast.mammary.tissue_female
## 2                                   gtex:lung
## 3                            gtex:whole.blood
## 4                       tcga:tcga-brca_female
## 5                              tcga:tcga-luad
## 6 scipher:scipher.sample.per.patient.baseline
##                                   model        a        b        L L_vs_total
## 1 Stretched exponential (by linear fit) 1.811947 64.89496 3.885985          0
## 2 Stretched exponential (by linear fit) 1.654318 28.07262 1.964261          0
## 3 Stretched exponential (by linear fit) 1.650687 21.33857 1.586486          0
## 4 Stretched exponential (by linear fit) 1.478753 17.41578 3.599061          0
## 5 Stretched exponential (by linear fit) 1.649890 37.43779 2.749527          0
## 6 Stretched exponential (by linear fit) 1.758283 41.83477 2.332343          0
##   adj.r.squared relative.error.mean
## 1     0.9963943          0.30863165
## 2     0.9838666          0.28316122
## 3     0.9832295          0.16228112
## 4     0.9974590          0.06618629
## 5     0.9979574          0.08379174
## 6     0.9908833          0.13866336
# Show all information for logarithmic
log_model_summary_df = analytical_model_summary_df %>% 
  dplyr::select(type_dataset, model, a, b, L, L_vs_total, adj.r.squared, relative.error.mean) %>% 
  filter((type_dataset %in% datasets_selected) & (model == "Logarithmic")) %>% unique() %>% 
  dplyr::select(type_dataset, adj.r.squared, relative.error.mean) %>% 
  rename("R**2" = "adj.r.squared", "epsilon" = "relative.error.mean") %>%
  arrange(factor(type_dataset, levels = datasets_selected)) %>%
  mutate_if(is.numeric, ~sprintf("%.2f",.))
print(log_model_summary_df)
##                                  type_dataset R**2 epsilon
## 1           gtex:breast.mammary.tissue_female 0.88    8.83
## 2                                   gtex:lung 0.95   10.44
## 3                            gtex:whole.blood 0.99    1.15
## 4                       tcga:tcga-brca_female 0.88    7.94
## 5                              tcga:tcga-luad 0.89   12.89
## 6 scipher:scipher.sample.per.patient.baseline 0.95    2.93
# Create table for figure
power_law_summary_df = analytical_model_summary_df %>% 
  dplyr::select(type_dataset, model, a, adj.r.squared, relative.error.mean) %>% 
  filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>%
  dplyr::select(-model) %>%
  arrange(factor(type_dataset, levels = datasets_selected)) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.sample.per.patient.baseline", "R. arthritis")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:whole.blood", "GTEx: Whole blood")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:lung", "GTEx: Lung")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:breast.mammary.tissue_female", "GTEx: Breast")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-luad", "TCGA: Lung cancer")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca_female", "TCGA: Breast cancer")) %>%
  rename("alpha"="a", "R**2" = "adj.r.squared", "epsilon" = "relative.error.mean") %>%
  unique() %>% 
  mutate_if(is.numeric, ~sprintf("%.2f",.))
print(power_law_summary_df)
##          type_dataset alpha R**2 epsilon
## 1        GTEx: Breast  1.81 1.00    0.31
## 2          GTEx: Lung  1.65 0.98    0.28
## 3   GTEx: Whole blood  1.65 0.98    0.16
## 4 TCGA: Breast cancer  1.48 1.00    0.07
## 5   TCGA: Lung cancer  1.65 1.00    0.08
## 6        R. arthritis  1.76 0.99    0.14
# Bind power law with logarithm results
#figure_summary_table = cbind(power_law_summary_df, (log_model_summary_df %>% select(-type_dataset)))
figure_summary_table = cbind(scaling_relation_summary_df, (power_law_summary_df %>% select(-type_dataset)), (log_model_summary_df %>% select(-type_dataset)))

# Assign first row as row names
rownames(figure_summary_table) <- figure_summary_table$type_dataset
figure_summary_table$type_dataset <- NULL

# Customize and print table
name2colorlight <- c(
  "TCGA: Lung cancer" = "#c0e5fa",
  "TCGA: Breast cancer" = "#2bb3ff",
  "breast neoplasms"="#2bb3ff", 
  "GTEx: Whole blood" = "#c3f7e4",
  "GTEx: Lung" = "#78ffa0",
  "GTEx: Breast" = "#47d671",
  "R. arthritis" = "#ffd6b5"
  )

colors_original_figure = as.vector(name2color[unique(rownames(figure_summary_table))])
colors_original_figure = c(tail(colors_original_figure, 1), head(colors_original_figure, -1)) # I don't know why, the color in fg_params starts from 2nd position, so I have to reorder the vector of colors in order to match the other colors
colors_light_figure = as.vector(name2colorlight[unique(rownames(figure_summary_table))])
core_figure = gridExtra::tableGrob(figure_summary_table, theme=ttheme_minimal(
  colhead=list(bg_params = list(fill = NA, col="black", lwd=1), 
               fg_params = list(parse=TRUE, fontface="bold", fontsize=18)), 
  rowhead=list(bg_params = list(fill = NA, col=NA), 
               fg_params=list(col="black", fontface="plain", fontsize=18)), 
  core=list(bg_params=list(fill=colors_light_figure, col="black", lwd=1), fg_params=list(fontsize=18))))
#header <- gridExtra::tableGrob(figure_summary_table[1, 1:2], rows=NULL, cols=c("Power law", "Logarithm"), theme=ttheme_minimal(
header <- gridExtra::tableGrob(figure_summary_table[1, 1:3], rows=NULL, cols=c("Power law", "Analytical model", "Logarithm"), theme=ttheme_minimal(
  colhead=list(bg_params = list(fill = NA, col="black", lwd=1), 
               fg_params = list(parse=TRUE, fontface="plain", fontsize=18))))
figure_summary_table_plot <- gtable_combine(header[1,], core_figure, along=2)
#figure_summary_table_plot$widths[2:length(figure_summary_table_plot$widths)] <- rep(max(figure_summary_table_plot$widths[2:length(figure_summary_table_plot$widths)]), length(figure_summary_table_plot$widths[2:length(figure_summary_table_plot$widths)])) # make column widths equal
figure_summary_table_plot$widths[3:length(figure_summary_table_plot$widths)] <- rep(max(figure_summary_table_plot$widths[3:length(figure_summary_table_plot$widths)]), length(figure_summary_table_plot$widths[3:length(figure_summary_table_plot$widths)])) # make columns 3 to end widths equal
figure_summary_table_plot$widths[2] <- rep(max(figure_summary_table_plot$widths[2])*0.7, length(figure_summary_table_plot$widths[2])) # make column 2 smaller
#grid.newpage()
#grid.draw(figure_summary_table_plot) # see what it looks like before altering gtable
# change the relevant rows of gtable
grid.newpage()
#figure_summary_table_plot$layout[1:4 , c("l", "r")] <- list(c(2, 7), c(6, 8)) # For 8 columns (including b and L)
#figure_summary_table_plot$layout[1:4 , c("l", "r")] <- list(c(2, 5), c(4, 6))
figure_summary_table_plot$layout[1:6 , c("l", "r")] <- list(c(2, 3, 6), c(2, 5, 7))
grid.draw(figure_summary_table_plot)

Prediction plot

All datasets:

#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca")
#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca", "scipher:scipher.complete.dataset")
#datasets_selected = c("gtex:breast.mammary.tissue", "gtex:thyroid", "tcga:tcga-brca", "tcga:tcga-thca", "scipher:scipher.complete.dataset")
#datasets_selected = c("gtex:breast.mammary.tissue", "gtex:thyroid", "gtex:whole.blood", "tcga:tcga-brca", "tcga:tcga-thca", "scipher:scipher.complete.dataset")
datasets_selected = c("gtex:breast.mammary.tissue_female", 
                      "gtex:lung", 
                      "gtex:whole.blood", 
                      "tcga:tcga-brca_female", 
                      "tcga:tcga-luad", 
                      "scipher:scipher.sample.per.patient.baseline")

#model_selected = "Stretched exponential (by optimization)"
model_selected = "Stretched exponential (by linear fit)"

# Process data
prediction_plot_df = results_selected_norm_df %>% 
  dplyr::right_join((predicted_results_norm_df %>% dplyr::select(type_dataset, model, model_result, size) %>% mutate_at(c('model_result'), as.numeric) %>% unique()), by=c("type_dataset", "size")) %>%
  dplyr::filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>%
  dplyr::select(size, rep, type_dataset, num_edges, model_result) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.sample.per.patient.baseline", "R. arthritis")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:whole.blood", "GTEx: Whole blood")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:lung", "GTEx: Lung")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:breast.mammary.tissue_female", "GTEx: Breast")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-luad", "TCGA: Lung cancer")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca_female", "TCGA: Breast cancer")) %>%
  inner_join( (name2color %>% as.data.frame() %>% dplyr::rename("rgb_col"=".") %>% tibble::rownames_to_column("type_dataset")), by=c("type_dataset") ) %>%
  mutate(geom_text_repel_label=dplyr::if_else(type_dataset == "R. arthritis" & size == 240 & rep == 1, "R. arthritis", dplyr::if_else(type_dataset == "GTEx: Whole blood" & size == 700 & rep == 1, "GTEx: Whole blood", dplyr::if_else(type_dataset == "GTEx: Lung" & size == 560 & rep == 1, "GTEx: Lung", dplyr::if_else(type_dataset == "GTEx: Breast" & size == 140 & rep == 1, "GTEx: Breast", dplyr::if_else(type_dataset == "TCGA: Lung cancer" & size == 460 & rep == 1, "TCGA: Lung cancer", dplyr::if_else(type_dataset == "TCGA: Breast cancer" & size == 800 & rep == 1, "TCGA: Breast cancer", "")))))))

# # Plot with legend
# prediction_plot = prediction_plot_df %>%
#   ggplot(aes(x=size, y=num_edges, col=type_dataset)) +
#   geom_point(alpha=0.5, size=3) +
#   #geom_line(data=(predicted_results_norm_df %>% filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>% unique()), aes(x=size, y=model_result, col=type_dataset), size=1) +
#   geom_line(aes(x=size, y=model_result, col=type_dataset), size=1) +
#   scale_x_continuous(trans = scales::log10_trans()) +
#   #scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#0072B2", "#56B4E9")) +
#   #scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#65F3BF", "#0072B2", "#56B4E9")) +
#   #scale_color_manual(values = c("#56B4E9", "#E69F00", "#44AA99", "#65F3BF", "#0072B2", "#D55E00")) +
#   scale_color_manual(values = as.vector(name2color[levels(factor(prediction_plot_df$type_dataset))])) + # Plot using the dictionary
#   theme_linedraw() +
#   xlab("Num. samples (Log)") +
#   ylab("Frac. significant correlations") +
#   guides(col=guide_legend(title="Dataset")) +
#   theme(aspect.ratio=1, 
#         plot.title =  element_text(size = 20, face="bold"), 
#         axis.title = element_text(size = 17, face="bold"), 
#         axis.text = element_text(size = 16), 
#         legend.text = element_text(size = 16), 
#         legend.title=element_text(size=16, face="bold"))

# Plot with labels
prediction_plot = prediction_plot_df %>%
  ggplot(aes(x=log10(size), y=num_edges, col=rgb_col)) +
  geom_point(alpha=0.5, size=3) +
  #geom_line(data=(predicted_results_norm_df %>% filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>% unique()), aes(x=size, y=model_result, col=type_dataset), size=1) +
  geom_line(aes(x=log10(size), y=model_result, col=rgb_col), size=1) +
  #scale_x_continuous(trans = scales::log10_trans()) +
  #scale_color_manual(values = as.vector(name2color[levels(factor(prediction_plot_df$type_dataset))])) + # Plot using the dictionary
  scale_colour_identity() +
  theme_linedraw() +
  xlab("Num. samples (Log10)") +
  ylab("Frac. significant correlations") +
  theme(aspect.ratio=1, 
        plot.title =  element_text(size = 20, face="bold"), 
        axis.title = element_text(size = 17, face="bold"), 
        axis.text = element_text(size = 16), 
        panel.grid.major=element_blank(), 
        panel.grid.minor = element_blank(),
        text = element_text(family = "Helvetica"),
        legend.position="none") +
  geom_label_repel(aes(col=rgb_col, label=geom_text_repel_label),
                   fill="white",
                   box.padding = 0.5, max.overlaps = Inf,
                   label.size = 0.75, 
                   size = 5,
                   alpha = 0.9, 
                   label.padding=0.25, 
                   fontface = "bold",
                   na.rm=TRUE,
                   seed = 1234)

plot_file = paste(plots_dir, "prediction_pearson_pval_0.05.png", sep="/")
print(prediction_plot)
## Warning: Removed 29835 rows containing missing values (`geom_point()`).

ggsave(
  plot_file,
  plot=prediction_plot,
  dpi = 1200,
  #width = 9500,
  #height = 6000,
  units = c("px")
)
## Saving 8400 x 6000 px image
## Warning: Removed 29835 rows containing missing values (`geom_point()`).

Less datasets:

datasets_selected = c("gtex:breast.mammary.tissue_female", 
                      "tcga:tcga-brca_female")
model_selected = "Stretched exponential (by linear fit)"

# Process data
prediction_plot_lessdatasets_df = results_selected_norm_df %>% 
  dplyr::right_join((predicted_results_norm_df %>% 
                       dplyr::select(type_dataset, model, model_result, size) %>% 
                       mutate_at(c('model_result'), as.numeric) %>% 
                       unique()), by=c("type_dataset", "size")) %>%
  dplyr::filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>%
  dplyr::select(size, rep, type_dataset, num_edges, model_result) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:breast.mammary.tissue_female", "GTEx: Breast")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca_female", "TCGA: Breast c.")) %>%
  inner_join( (name2color %>% as.data.frame() %>% dplyr::rename("rgb_col"=".") %>% tibble::rownames_to_column("type_dataset")), by=c("type_dataset") ) %>%
  mutate(geom_text_repel_label=dplyr::if_else(type_dataset == "R. arthritis" & size == 240 & rep == 1, "R. arthritis", dplyr::if_else(type_dataset == "GTEx: Whole blood" & size == 700 & rep == 1, "GTEx: Whole blood", dplyr::if_else(type_dataset == "GTEx: Lung" & size == 560 & rep == 1, "GTEx: Lung", dplyr::if_else(type_dataset == "GTEx: Breast" & size == 140 & rep == 1, "GTEx: Breast", dplyr::if_else(type_dataset == "TCGA: Lung cancer" & size == 460 & rep == 1, "TCGA: Lung cancer", dplyr::if_else(type_dataset == "TCGA: Breast c." & size == 1000 & rep == 1, "TCGA: Breast c.", "")))))))

# Plot with labels
prediction_plot_lessdatasets = prediction_plot_lessdatasets_df %>%
  ggplot(aes(x=log10(size), y=num_edges, col=rgb_col)) +
  geom_point(alpha=0.5, size=3) +
  #geom_line(data=(predicted_results_norm_df %>% filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>% unique()), aes(x=size, y=model_result, col=type_dataset), size=1) +
  geom_line(aes(x=log10(size), y=model_result, col=rgb_col), size=1) +
  #scale_x_continuous(trans = scales::log10_trans()) +
  #scale_color_manual(values = as.vector(name2color[levels(factor(prediction_plot_lessdatasets_df$type_dataset))])) + # Plot using the dictionary
  #scale_colour_identity() +
  scale_colour_identity(labels = unique(prediction_plot_lessdatasets_df$type_dataset), breaks = unique(prediction_plot_lessdatasets_df$rgb_col), guide = "legend") +
  theme_linedraw() +
  xlab("Num. samples (Log10)") +
  ylab("Frac. significant correlations") +
  guides(col=guide_legend(title = NULL)) +
  theme(aspect.ratio=1, 
        plot.title =  element_text(size = 20, face="bold"), 
        axis.title = element_text(size = 17, face="bold"), 
        axis.text = element_text(size = 16), 
        panel.grid.major=element_blank(), 
        panel.grid.minor = element_blank(),
        legend.position = c(0.28, 0.92),
        legend.text = element_text(size = 16), 
        legend.title = element_text(size = 16, face="bold"),
        #legend.position="none"),
        legend.background = element_rect(fill = 'transparent', color = "transparent"),
        text = element_text(family = "Helvetica")) #+
  # geom_label_repel(aes(col=rgb_col, label=geom_text_repel_label),
  #                  fill="white",
  #                  box.padding = 0.5, max.overlaps = Inf,
  #                  label.size = NA, 
  #                  size = 5,
  #                  alpha = 0.9, 
  #                  label.padding=0.25, 
  #                  fontface = "bold",
  #                  na.rm=TRUE,
  #                  seed = 1234)

plot_file = paste(plots_dir, "prediction_pearson_pval_0.05_lessdatasets.png", sep="/")
print(prediction_plot_lessdatasets)
## Warning: Removed 9938 rows containing missing values (`geom_point()`).

ggsave(
  plot_file,
  plot=prediction_plot_lessdatasets,
  dpi = 1200,
  #width = 9500,
  #height = 6000,
  units = c("px")
)
## Saving 8400 x 6000 px image
## Warning: Removed 9938 rows containing missing values (`geom_point()`).

Number of samples for a given % of significant edges:

datasets_selected = c("gtex:breast.mammary.tissue_female", 
                      "gtex:lung", 
                      "gtex:whole.blood", 
                      "tcga:tcga-brca_female", 
                      "tcga:tcga-luad", 
                      "scipher:scipher.sample.per.patient.baseline")

model_selected = "Stretched exponential (by linear fit)"
predicted_results_norm_df %>% dplyr::select(type_dataset, model, model_result, size) %>%
  mutate_at(c('model_result'), as.numeric) %>%
  filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>%
  group_by(type_dataset) %>% 
  filter((abs(model_result - 0.25) == min(abs(model_result - 0.25))) | (abs(model_result - 0.5) == min(abs(model_result - 0.5))) | (abs(model_result - 0.75) == min(abs(model_result - 0.75))))
## # A tibble: 18 × 4
## # Groups:   type_dataset [6]
##    type_dataset                                model               model…¹  size
##    <chr>                                       <chr>                 <dbl> <int>
##  1 scipher:scipher.sample.per.patient.baseline Stretched exponent…   0.252   130
##  2 scipher:scipher.sample.per.patient.baseline Stretched exponent…   0.499   320
##  3 scipher:scipher.sample.per.patient.baseline Stretched exponent…   0.749  1020
##  4 gtex:breast.mammary.tissue_female           Stretched exponent…   0.255   150
##  5 gtex:breast.mammary.tissue_female           Stretched exponent…   0.503   350
##  6 gtex:breast.mammary.tissue_female           Stretched exponent…   0.750  1020
##  7 gtex:lung                                   Stretched exponent…   0.250   190
##  8 gtex:lung                                   Stretched exponent…   0.501   550
##  9 gtex:lung                                   Stretched exponent…   0.750  2100
## 10 gtex:whole.blood                            Stretched exponent…   0.251   130
## 11 gtex:whole.blood                            Stretched exponent…   0.503   380
## 12 gtex:whole.blood                            Stretched exponent…   0.750  1450
## 13 tcga:tcga-brca_female                       Stretched exponent…   0.250   920
## 14 tcga:tcga-brca_female                       Stretched exponent…   0.500  3910
## 15 tcga:tcga-brca_female                       Stretched exponent…   0.750 24570
## 16 tcga:tcga-luad                              Stretched exponent…   0.250   310
## 17 tcga:tcga-luad                              Stretched exponent…   0.500   900
## 18 tcga:tcga-luad                              Stretched exponent…   0.750  3480
## # … with abbreviated variable name ¹​model_result

Comparison of models: stretched exponential vs. logarithm

Comparison of stretched exponential with logarithm:

models_selected = c("Stretched exponential (by linear fit)", "Logarithmic", "Mean")
#dataset_selected = "gtex:whole.blood"
dataset_selected = "tcga:tcga-brca_female"
comparison_palette = name2color[c(dataset_selected, "Analytical model", "Logarithm")]

# Join predicted results with mean
predicted_results_norm_and_mean_df = rbind((topology_results_selected_by_size_norm_df %>% dplyr::select(type_dataset, size, mean_norm) %>% unique() %>% rename("model_result" = "mean_norm") %>% mutate(model="Mean")), (predicted_results_norm_df %>% dplyr::select(type_dataset, size, model_result, model) %>% unique() %>% mutate_at(c('model_result'), as.numeric))) %>% 
  filter(model %in% models_selected) %>%
  mutate(model = replace(model, model == "Stretched exponential (by linear fit)", "Analytical model")) %>%
  mutate(model = replace(model, model == "Logarithmic", "Logarithm"))
predicted_results_norm_and_mean_df$model <- factor(predicted_results_norm_and_mean_df$model, levels = c("Mean", "Analytical model", "Logarithm"))

# Process data
comparison_log_df = results_selected_norm_df %>% 
  dplyr::select(type_dataset, size, rep, num_edges) %>%
  right_join(predicted_results_norm_and_mean_df, by=c("type_dataset", "size")) %>%
  filter(type_dataset == dataset_selected) %>%
  filter(size <= max((results_selected_norm_df %>% filter(type_dataset == dataset_selected))$size)) %>%
  filter(size >= min((results_selected_norm_df %>% filter(type_dataset == dataset_selected))$size)) %>%
  inner_join( (name2color %>% as.data.frame() %>% dplyr::rename("rgb_col"=".") %>% tibble::rownames_to_column("model") %>% mutate(model = replace(model, model == dataset_selected, "Mean"))), by=c("model") ) %>%
  mutate(geom_text_repel_label=dplyr::if_else(model == "Logarithm" & size == 280 & rep == 1, "Logarithm", dplyr::if_else(model == "Analytical model" & size == 760 & rep == 1, "Analytical model", dplyr::if_else(model == "Mean" & size == 560 & rep == 1, "Mean", ""))))

# # Plot with legend
# comparison_plot = comparison_log_df %>%
#   ggplot(aes(x=size, y=num_edges, col=model)) +
#   geom_point(alpha=0.1, size=3, col=comparison_palette[1]) +
#   geom_line(aes(x=size, y=model_result, col=model), size=1) +
#   #scale_x_continuous(trans = scales::log10_trans()) +
#   scale_color_manual(values = comparison_palette) +
#   theme_linedraw() +
#   xlab("Num. samples") +
#   ylab("Frac. significant correlations") +
#   #guides(col=guide_legend(title="Model")) +
#   theme(aspect.ratio=1,
#         plot.title =  element_text(size = 20, face="bold"),
#         axis.title = element_text(size = 17, face="bold"),
#         axis.text = element_text(size = 16),
#         legend.text = element_text(size = 16), 
#         #legend.title=element_text(size=16, face="bold"), 
#         legend.title=element_blank(), 
#         legend.position="bottom")

# Plot with labels
comparison_plot = comparison_log_df %>%
  ggplot(aes(x=size, y=num_edges)) +
  geom_point(alpha=0.1, size=3, col=comparison_palette[[dataset_selected]]) +
  geom_line(aes(x=size, y=model_result, col=rgb_col), size=1) +
  scale_colour_identity() +
  theme_linedraw() +
  xlab("Num. samples") +
  ylab("Frac. significant correlations") +
  theme(aspect.ratio=1, 
        plot.title =  element_text(size = 20, face="bold"), 
        axis.title = element_text(size = 17, face="bold"), 
        axis.text = element_text(size = 16), 
        panel.grid.major=element_blank(), 
        panel.grid.minor = element_blank(),
        text = element_text(family = "Helvetica"),
        legend.position="none") +
  geom_label_repel(aes(x=size, y=model_result, col=rgb_col, label=geom_text_repel_label),
                   fill="white",
                   box.padding = 0.5, max.overlaps = Inf,
                   label.size = 0.75, 
                   size = 5,
                   alpha = 1, 
                   label.padding=0.25, 
                   fontface = "bold",
                   na.rm=TRUE,
                   seed = 1234)

plot_file = paste(plots_dir, "/comparison_models_pearson_pval_0.05_", dataset_selected, ".png", sep="")
print(comparison_plot)
## Warning: Removed 106 rows containing missing values (`geom_point()`).

ggsave(
  plot_file,
  plot=comparison_plot,
  dpi = 1200,
  #width = 6600,
  #height = 6700,
  units = c("px")
)
## Saving 8400 x 6000 px image
## Warning: Removed 106 rows containing missing values (`geom_point()`).

Comparison of power law with exponential decay:

datasets_selected = c("tcga:tcga-brca_female")

# Calculate data from scaling relationship
cols = c("size", 
         "S", 
         "type_dataset", 
         "S_lag1", 
         "Fn", 
         "model",
         "model_result",
         "slope", 
         "intercept", 
         "adj.r.squared")

scaling_relation_comparison_df = data.frame(matrix(ncol = length(cols),
                                                   nrow = 0,
                                                   dimnames = list(NULL, cols)))

for (dataset_selected in datasets_selected){
  scaling_relation_comp = topology_results_selected_by_size_df %>% 
    filter(type_dataset == dataset_selected) %>%
    select(size, mean, type_dataset) %>%
    unique() %>%
    rename("S" = "mean") %>%
    arrange(size) %>%
    mutate(S_lag1 = if_else(size == min(size), 0, lag(S))) %>%
    mutate(Fn = ((S - S_lag1) / S_lag1)) %>%
    filter((!(is.infinite(Fn))) & (Fn > 0))
  pl_summary = summary(lm(log(scaling_relation_comp$Fn)~log(scaling_relation_comp$size)))
  pl_pred = exp(log(scaling_relation_comp$size) * coef(pl_summary)[2] + coef(pl_summary)[1])
  ed_summary = summary(lm(scaling_relation_comp$Fn~scaling_relation_comp$size))
  ed_pred = scaling_relation_comp$size * coef(ed_summary)[2] + coef(ed_summary)[1]
  scaling_relation_comparison_df = rbind(scaling_relation_comparison_df, 
                                           cbind(scaling_relation_comp, 
                                                 data.frame(model = "Power law",
                                                            model_result = pl_pred),
                                                 data.frame(slope = coef(pl_summary)[2], 
                                                            intercept = coef(pl_summary)[1], 
                                                            adj.r.squared = pl_summary$adj.r.squared)))
  scaling_relation_comparison_df = rbind(scaling_relation_comparison_df, 
                                           cbind(scaling_relation_comp, 
                                                 data.frame(model = "Exponential decay",
                                                            model_result = ed_pred),
                                                 data.frame(slope = coef(ed_summary)[2], 
                                                            intercept = coef(ed_summary)[1], 
                                                            adj.r.squared = ed_summary$adj.r.squared)))
}

# Process results
scaling_relation_comparison_df = scaling_relation_comparison_df %>%
  filter(Fn > 0) %>%
  mutate_at(c('adj.r.squared'), ~sprintf("%.2f",.)) %>%
  mutate_at(c('adj.r.squared'), as.character) %>%
  mutate(type_dataset = replace(type_dataset, 
                                type_dataset == "tcga:tcga-brca_female", 
                                "TCGA: Breast cancer")) %>%
  mutate(type_dataset = replace(type_dataset, 
                                type_dataset == "gtex:breast.mammary.tissue_female", 
                                "GTEx: Breast")) %>%
  inner_join( (name2color %>% 
                 as.data.frame() %>% 
                 dplyr::rename("rgb_col"=".") %>% 
                 tibble::rownames_to_column("model")), 
              by=c("model") )

scaling_relation_comparison_df$r2labels = 
  ifelse(scaling_relation_comparison_df$model == "Power law", 
         paste("Power law: ", 
               unique((scaling_relation_comparison_df %>% 
                         filter(model == "Power law"))$adj.r.squared), 
               sep=""), 
         ifelse(scaling_relation_comparison_df$model == "Exponential decay", 
                paste("Exp. decay: ", 
                      unique((scaling_relation_comparison_df %>% 
                                filter(model == "Exponential decay"))$adj.r.squared), 
                      sep=""), 
                ""))

# Plot with labels
scaling_relation_plot_comparison = scaling_relation_comparison_df %>%
  ggplot(aes(x=size, y=Fn)) +
  geom_point(alpha=0.9, size=3, col="#0072B2") +
  #geom_abline(aes(intercept=intercept, slope=slope, col=model), size=1) +
  geom_line(aes(x= size, y=model_result, col=r2labels), size=1) +
  theme_linedraw() +
  xlab("n") +
  ylab(bquote(bold((S[n]-S[n-1])/S[n-1]))) +
  scale_color_manual(guide = 'legend', values=as.vector(name2color[levels(factor(scaling_relation_comparison_df$model))])) +
  #scale_colour_identity(guide = 'legend', labels=levels(factor(scaling_relation_comparison_df$r2labels))) +
  guides(col=guide_legend(title=bquote(bold(R^2)))) +
  theme(aspect.ratio=1, 
        plot.title =  element_text(size = 20, face="bold"), 
        axis.title = element_text(size = 17, face="bold"), 
        axis.text = element_text(size = 16), 
        panel.grid.major=element_blank(), 
        panel.grid.minor = element_blank(),
        text = element_text(family = "Helvetica"),
        #legend.position="none") +
        legend.position = c(0.64, 0.85),
        legend.text = element_text(size = 16), 
        legend.title=element_text(size=16, face="bold"),
        legend.background = element_rect(fill = 'transparent', color = "transparent"))
        #legend.background = element_rect(linewidth=0.2, linetype="solid", colour ="black"))

plot_file = paste(plots_dir, "/comparison_pl_ed_pearson_pval_0.05_", dataset_selected, ".png", sep="")
ggsave(
  plot_file,
  plot=scaling_relation_plot_comparison,
  dpi = 1200,
  #width = 9500,
  height = 6000,
  units = c("px")
)
## Saving 8400 x 6000 px image
print(scaling_relation_plot_comparison)

Demonstration of the model goodness-of-fit for 1 model:

models_selected = c("Stretched exponential (by linear fit)")
#dataset_selected = "gtex:whole.blood"
dataset_selected = "tcga:tcga-brca_female"
comparison_palette = name2color[c(dataset_selected, "Analytical model")]

# Join predicted results with mean
predicted_results_norm_and_mean_df <- 
  rbind((topology_results_selected_by_size_norm_df %>% 
           dplyr::select(type_dataset, size, mean_norm) %>% 
           unique() %>% 
           rename("model_result" = "mean_norm") %>% 
           mutate(model="Mean")), 
        (predicted_results_norm_df %>% 
           dplyr::select(type_dataset, size, model_result, model) %>% 
           unique() 
         %>% mutate_at(c('model_result'), as.numeric))) %>%
  filter(model %in% models_selected) %>%
  mutate(model = replace(model, 
                         model == "Stretched exponential (by linear fit)", 
                         "Analytical model"))
predicted_results_norm_and_mean_df$model <- factor(predicted_results_norm_and_mean_df$model, levels = c("Mean", "Analytical model", "Logarithm"))

# Process data
goodnessfit_df = results_selected_norm_df %>% 
  dplyr::select(type_dataset, size, rep, num_edges) %>%
  right_join(predicted_results_norm_and_mean_df, by=c("type_dataset", "size")) %>%
  filter(type_dataset == dataset_selected) %>%
  filter(size <= max((results_selected_norm_df %>% filter(type_dataset == dataset_selected))$size)) %>%
  filter(size >= min((results_selected_norm_df %>% filter(type_dataset == dataset_selected))$size)) %>%
  inner_join( (name2color %>% as.data.frame() %>% dplyr::rename("rgb_col"=".") %>% tibble::rownames_to_column("model") %>% mutate(model = replace(model, model == dataset_selected, "Mean"))), by=c("model") ) %>%
  mutate(geom_text_repel_label=dplyr::if_else(model == "Logarithm" & size == 280 & rep == 1, "Logarithm", dplyr::if_else(model == "Analytical model" & size == 760 & rep == 1, "Analytical model", dplyr::if_else(model == "Mean" & size == 560 & rep == 1, "Mean", ""))))

# Plot with labels
goodnessfit_plot = goodnessfit_df %>%
  ggplot(aes(x=size, y=num_edges)) +
  geom_point(alpha=0.6, size=3, col=comparison_palette[[dataset_selected]]) +
  geom_line(aes(x=size, y=model_result, col=rgb_col), size=1) +
  scale_colour_identity() +
  theme_linedraw() +
  xlab("Num. samples") +
  ylab("Frac. significant correlations") +
  theme(aspect.ratio=1, 
        plot.title =  element_text(size = 20, face="bold"), 
        axis.title = element_text(size = 17, face="bold"), 
        axis.text = element_text(size = 16), 
        panel.grid.major=element_blank(), 
        panel.grid.minor = element_blank(),
        text = element_text(family = "Helvetica"),
        legend.position="none") +
  geom_label_repel(aes(x=size, y=model_result, col=rgb_col, label=geom_text_repel_label),
                   fill="white",
                   box.padding = 0.5, max.overlaps = Inf,
                   label.size = 0.75, 
                   size = 5,
                   alpha = 1, 
                   label.padding=0.25, 
                   fontface = "bold",
                   na.rm=TRUE,
                   seed = 1234)

plot_file = paste(plots_dir, "/goodnessfit_model_pearson_pval_0.05_", dataset_selected, ".png", sep="")
print(goodnessfit_plot)
## Warning: Removed 53 rows containing missing values (`geom_point()`).

ggsave(
  plot_file,
  plot=goodnessfit_plot,
  dpi = 1200,
  #width = 6600,
  #height = 6700,
  units = c("px")
)
## Saving 8400 x 6000 px image
## Warning: Removed 53 rows containing missing values (`geom_point()`).

Demonstration of the model goodness-of-fit for 2 models:

datasets_selected = c("gtex:breast.mammary.tissue_female", 
                      "tcga:tcga-brca_female")
models_selected = c("Stretched exponential (by linear fit)")

# Join predicted results with mean
#predicted_results_norm_and_mean_df <- 
predicted_results_mean_df <- 
  #rbind((topology_results_selected_by_size_norm_df %>% 
  rbind((topology_results_selected_by_size_df %>% 
           #dplyr::select("type_dataset", "size", "mean_norm") %>% 
           dplyr::select("type_dataset", "size", "mean") %>% 
           unique() %>% 
           #rename("model_result" = "mean_norm") %>% 
           rename("model_result" = "mean") %>% 
           mutate(model="Mean")), 
        #(predicted_results_norm_df %>% 
        (predicted_results_df %>% 
           dplyr::select(type_dataset, size, model_result, model) %>% 
           unique() 
         %>% mutate_at(c('model_result'), as.numeric))) %>%
  filter(type_dataset %in% datasets_selected) %>%
  filter(model %in% models_selected) %>%
  inner_join((analytical_model_summary_df %>% 
                dplyr::select("type_dataset", "model", "adj.r.squared")),
             by = c("type_dataset", "model")) %>%
  mutate_at(c('adj.r.squared'), ~sprintf("%.2f",.)) %>%
  mutate_at(c('adj.r.squared'), as.character)

# Process data
#goodnessfit_lessdatasets_df = results_selected_norm_df %>% 
goodnessfit_lessdatasets_df = results_selected_df %>% 
  dplyr::select(type_dataset, size, rep, num_edges) %>%
  #right_join(predicted_results_norm_and_mean_df, by=c("type_dataset", "size")) %>%
  right_join(predicted_results_mean_df, by=c("type_dataset", "size")) %>%
  filter(type_dataset %in% datasets_selected) %>%
  #filter(size <= max((results_selected_norm_df %>% 
  filter(size <= max((results_selected_df %>% 
                        filter(type_dataset %in% datasets_selected))$size)) %>%
  #filter(size >= min((results_selected_norm_df %>% 
  filter(size >= min((results_selected_df %>% 
                        filter(type_dataset %in% datasets_selected))$size)) %>%
  filter(!(is.na(num_edges))) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:breast.mammary.tissue_female", "GTEx: Breast")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca_female", "TCGA: Breast cancer")) %>%
  inner_join( (name2color %>% as.data.frame() %>% dplyr::rename("rgb_col"=".") %>% tibble::rownames_to_column("type_dataset")), by=c("type_dataset") ) %>%
  mutate(geom_text_repel_label=dplyr::if_else(type_dataset == "R. arthritis" & size == 240 & rep == 1, "R. arthritis", dplyr::if_else(type_dataset == "GTEx: Whole blood" & size == 700 & rep == 1, "GTEx: Whole blood", dplyr::if_else(type_dataset == "GTEx: Lung" & size == 560 & rep == 1, "GTEx: Lung", dplyr::if_else(type_dataset == "GTEx: Breast" & size == 140 & rep == 1, "GTEx: Breast", dplyr::if_else(type_dataset == "TCGA: Lung cancer" & size == 460 & rep == 1, "TCGA: Lung cancer", dplyr::if_else(type_dataset == "TCGA: Breast cancer" & size == 1000 & rep == 1, "TCGA: Breast cancer", "")))))))

goodnessfit_lessdatasets_df$r2labels = 
  ifelse(goodnessfit_lessdatasets_df$type_dataset == "GTEx: Breast", 
         paste("GTEx Breast: ", 
               unique((goodnessfit_lessdatasets_df %>% 
                         filter(type_dataset == "GTEx: Breast"))$adj.r.squared), 
               sep=""), 
         ifelse(goodnessfit_lessdatasets_df$type_dataset == "TCGA: Breast cancer", 
                paste("TCGA Breast c.: ", 
                      unique((goodnessfit_lessdatasets_df %>% 
                                filter(type_dataset == "TCGA: Breast cancer"))$adj.r.squared), 
                      sep=""), 
                ""))

# Plot with labels
goodnessfit_plot_lessdatasets = goodnessfit_lessdatasets_df %>%
  ggplot(aes(x=size, y=num_edges, col=r2labels)) +
  geom_point(alpha=0.6, size=3) +
  geom_line(aes(x=size, y=model_result, col=r2labels), size=1) +
  #scale_colour_identity() +
  scale_color_manual(guide = 'legend', values=as.vector(name2color[levels(factor(goodnessfit_lessdatasets_df$type_dataset))])) +
  theme_linedraw() +
  xlab("Num. samples") +
  ylab("Num. significant correlations") +
  guides(col=guide_legend(title=bquote(bold(.('Analytical model') ~ R^2)))) +
  theme(aspect.ratio=1, 
        plot.title =  element_text(size = 20, face="bold"), 
        axis.title = element_text(size = 17, face="bold"), 
        axis.text = element_text(size = 16), 
        panel.grid.major=element_blank(), 
        panel.grid.minor = element_blank(),
        text = element_text(family = "Helvetica"),
        #legend.position="none") +
        legend.position = c(0.64, 0.15),
        legend.text = element_text(size = 16), 
        legend.title=element_text(size=16, face="bold"),
        #legend.background = element_rect(linewidth=0.2, linetype="solid", colour ="black")
        legend.background = element_rect(fill = "transparent", color = "transparent")
        )
  # geom_label_repel(aes(x=size, y=model_result, col=rgb_col, label=geom_text_repel_label),
  #                  fill="white",
  #                  box.padding = 0.5, max.overlaps = Inf,
  #                  label.size = 0.75, 
  #                  size = 5,
  #                  alpha = 1, 
  #                  label.padding=0.25, 
  #                  fontface = "bold",
  #                  na.rm=TRUE,
  #                  seed = 1234)

plot_file = paste(plots_dir, "/goodnessfit_2models_pearson_pval_0.05_", dataset_selected, ".png", sep="")
print(goodnessfit_plot_lessdatasets)

ggsave(
  plot_file,
  plot = goodnessfit_plot_lessdatasets,
  dpi = 1200,
  #width = 6600,
  #height = 6700,
  units = c("px")
)
## Saving 8400 x 6000 px image

a vs. fraction of significant correlations plot

Define functions:

#'  calculate_predictions_using_stretched_exponential_model_optimized
#'  Formula to calculate exponential decay of significant interactions from a list of sample sizes.
#'  This formula requires the use of the L parameter
#'  @param x List of sample sizes.
#'  @param a Slope coefficient.
#'  @param b Intercept coefficient.
#'  
calculate_predictions_using_stretched_exponential_model_optimized = function(x, L, a, b){
  y = L * exp((b * x ** (-a + 1)) / (-a + 1))
  #y = exp(log(L) - exp(b+a*log(x)))
  return(y)
}

#'  calculate_prediction_from_analytical_model
#'  Function to calculate predictions using a specific analytical model
#'  @param model Name of the model used.
#'  @param x_list List of values in the x axis (e.g. size)
#'  @param a Parameter a.
#'  @param b Parameter b.
#'  @param L Parameter L.
#'  
calculate_prediction_from_analytical_model = function(model, x_list, a, b, L){
  if(model == "Logarithmic"){
    prediction_result = (log(x_list) *a + b)
  } else if((model == "Stretched exponential (by optimization)") | (model == "Stretched exponential (by linear fit)")){
    prediction_result = calculate_predictions_using_stretched_exponential_model_optimized(x=x_list, L=L, a=a, b=b)
  } else if(model == "Stretched exponential (without L)"){
    prediction_result = calculate_predictions_using_stretched_exponential_model_without_L(x=x_list, a=a, b=b)
  }
  return(prediction_result)
}

Calculate predictions of the models:

#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca")
#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca", "scipher:scipher.complete.dataset")
#datasets_selected = c("gtex:breast.mammary.tissue", "gtex:thyroid", "tcga:tcga-brca", "tcga:tcga-thca", "scipher:scipher.complete.dataset")
#datasets_selected = c("gtex:breast.mammary.tissue", "gtex:thyroid", "gtex:whole.blood", "tcga:tcga-brca", "tcga:tcga-thca", "scipher:scipher.complete.dataset")
datasets_selected = c("gtex:breast.mammary.tissue_female", 
                      "gtex:lung", 
                      "gtex:whole.blood", 
                      "tcga:tcga-brca_female", 
                      "tcga:tcga-luad", 
                      "scipher:scipher.sample.per.patient.baseline")
#model_selected = "Stretched exponential (by optimization)"
model_selected = "Stretched exponential (by linear fit)"
sample_size_vs_a_df = analytical_model_summary_df %>% 
  inner_join(sample_size_correlation_df, 
             by = c("type_dataset" = "dataset")) %>% 
  #filter((type_dataset %in% datasets_selected) & (model == model_selected)) 
  filter(model == model_selected)

# Calculate number of edges for each critical correlation using model
sample_size_vs_a_df$num_edges_from_statistical_corrected = 
  calculate_prediction_from_analytical_model(model = model_selected, 
                                             x_list = sample_size_vs_a_df$sample_size_statistical_corrected, 
                                             a = sample_size_vs_a_df$a, 
                                             b = sample_size_vs_a_df$b, 
                                             L = sample_size_vs_a_df$L)
sample_size_vs_a_df$num_edges_from_statistical_corrected = 
  sample_size_vs_a_df$num_edges_from_statistical_corrected * sample_size_vs_a_df$max_value_in_dataset
sample_size_vs_a_df$num_edges_from_statistical_corrected_norm = 
  sample_size_vs_a_df$num_edges_from_statistical_corrected / sample_size_vs_a_df$total_num_edges

# sample_size_vs_a_df = sample_size_vs_a_df %>% 
#   tidyr::separate(type_dataset, 
#                   into = c("dataset", NA), 
#                   sep = ":", 
#                   remove = FALSE) %>%
#   tidyr::separate(type_dataset, 
#                   into = c("dataset_name", "sex"), 
#                   sep = "_", 
#                   remove = TRUE) %>%
#   mutate(subclassification = 
#            if_else(dataset == "tcga", 
#                   "tumor", 
#                   "")) %>%
#   mutate(dataset_name = 
#            if_else(dataset == "tcga", 
#                    paste(dataset_name, 
#                          subclassification, 
#                          sep = "-"), 
#                    dataset_name)) %>%
#   mutate(dataset_name = 
#            if_else(!(is.na(sex)), 
#                    paste(dataset_name, 
#                          sex, 
#                          sep = "_"), 
#                    dataset_name))

sample_size_vs_a_df %>% head(10)
##                                   type_dataset
## 1             scipher:scipher.complete.dataset
## 2             scipher:scipher.complete.dataset
## 3             scipher:scipher.complete.dataset
## 4             scipher:scipher.complete.dataset
## 5  scipher:scipher.sample.per.patient.baseline
## 6  scipher:scipher.sample.per.patient.baseline
## 7  scipher:scipher.sample.per.patient.baseline
## 8  scipher:scipher.sample.per.patient.baseline
## 9                   gtex:breast.mammary.tissue
## 10                  gtex:breast.mammary.tissue
##                                    model        a        b        L
## 1  Stretched exponential (by linear fit) 1.790429 42.98998 1.462534
## 2  Stretched exponential (by linear fit) 1.790429 42.98998 1.462534
## 3  Stretched exponential (by linear fit) 1.790429 42.98998 1.462534
## 4  Stretched exponential (by linear fit) 1.790429 42.98998 1.462534
## 5  Stretched exponential (by linear fit) 1.758283 41.83477 2.332343
## 6  Stretched exponential (by linear fit) 1.758283 41.83477 2.332343
## 7  Stretched exponential (by linear fit) 1.758283 41.83477 2.332343
## 8  Stretched exponential (by linear fit) 1.758283 41.83477 2.332343
## 9  Stretched exponential (by linear fit) 1.720270 43.15181 2.114016
## 10 Stretched exponential (by linear fit) 1.720270 43.15181 2.114016
##    max_value_in_dataset
## 1              65309202
## 2              65309202
## 3              65309202
## 4              65309202
## 5              40568917
## 6              40568917
## 7              40568917
## 8              40568917
## 9              74740207
## 10             74740207
##                                                         formula adj.r.squared
## 1  F(x) = 1.46 * exp[(42.99 * x**((1.79) + 1)) / ((1.79) + 1) ]     0.9868786
## 2  F(x) = 1.46 * exp[(42.99 * x**((1.79) + 1)) / ((1.79) + 1) ]     0.9868786
## 3  F(x) = 1.46 * exp[(42.99 * x**((1.79) + 1)) / ((1.79) + 1) ]     0.9868786
## 4  F(x) = 1.46 * exp[(42.99 * x**((1.79) + 1)) / ((1.79) + 1) ]     0.9868786
## 5  F(x) = 2.33 * exp[(41.83 * x**((1.76) + 1)) / ((1.76) + 1) ]     0.9908833
## 6  F(x) = 2.33 * exp[(41.83 * x**((1.76) + 1)) / ((1.76) + 1) ]     0.9908833
## 7  F(x) = 2.33 * exp[(41.83 * x**((1.76) + 1)) / ((1.76) + 1) ]     0.9908833
## 8  F(x) = 2.33 * exp[(41.83 * x**((1.76) + 1)) / ((1.76) + 1) ]     0.9908833
## 9  F(x) = 2.11 * exp[(43.15 * x**((1.72) + 1)) / ((1.72) + 1) ]     0.9871514
## 10 F(x) = 2.11 * exp[(43.15 * x**((1.72) + 1)) / ((1.72) + 1) ]     0.9871514
##    relative.error.mean  unnorm_L total_num_edges density subclassification
## 1            0.1392092  95516931        95516931       1                  
## 2            0.1392092  95516931        95516931       1                  
## 3            0.1392092  95516931        95516931       1                  
## 4            0.1392092  95516931        95516931       1                  
## 5            0.1386634  94620646        94620646       1                  
## 6            0.1386634  94620646        94620646       1                  
## 7            0.1386634  94620646        94620646       1                  
## 8            0.1386634  94620646        94620646       1                  
## 9            0.1889067 158001976       158001976       1                  
## 10           0.1889067 158001976       158001976       1                  
##    L_vs_total                                dataset_name dataset sex
## 1           0            scipher:scipher.complete.dataset scipher    
## 2           0            scipher:scipher.complete.dataset scipher    
## 3           0            scipher:scipher.complete.dataset scipher    
## 4           0            scipher:scipher.complete.dataset scipher    
## 5           0 scipher:scipher.sample.per.patient.baseline scipher    
## 6           0 scipher:scipher.sample.per.patient.baseline scipher    
## 7           0 scipher:scipher.sample.per.patient.baseline scipher    
## 8           0 scipher:scipher.sample.per.patient.baseline scipher    
## 9           0                  gtex:breast.mammary.tissue    gtex    
## 10          0                  gtex:breast.mammary.tissue    gtex    
##    type_correlation correlation sample_size_statistical
## 1              weak         0.2               68.773025
## 2          moderate         0.4               18.002295
## 3            strong         0.6                8.523611
## 4       very strong         0.8                5.063346
## 5              weak         0.2               68.773025
## 6          moderate         0.4               18.002295
## 7            strong         0.6                8.523611
## 8       very strong         0.8                5.063346
## 9              weak         0.2               68.773025
## 10         moderate         0.4               18.002295
##    sample_size_statistical_corrected num_edges_from_statistical_corrected
## 1                          914.63989                             74520763
## 2                          216.05522                             43934688
## 3                           85.91378                             19096937
## 4                           38.90078                              4702026
## 5                          914.18952                             69148495
## 6                          215.94977                             37083170
## 7                           85.87258                             14367043
## 8                           38.88278                              3042116
## 9                          938.69085                            102474141
## 10                         221.68638                             46440188
##    num_edges_from_statistical_corrected_norm
## 1                                 0.78018380
## 2                                 0.45996754
## 3                                 0.19993248
## 4                                 0.04922715
## 5                                 0.73079712
## 6                                 0.39191415
## 7                                 0.15183835
## 8                                 0.03215065
## 9                                 0.64856240
## 10                                0.29392157

Save the predictions:

type_correlation_selected = "weak"

sample_size_vs_a_df %>%
  filter(type_correlation == type_correlation_selected) %>%
  arrange(desc(a))
##                                   type_dataset
## 1                 gse193677:rectum_uc_inflamed
## 2                                    tcga:lung
## 3         gse193677:rectum_control_noninflamed
## 4                 gse193677:rectum_cd_inflamed
## 5              gtex:skin.sun.exposed.lower.leg
## 6                           tcga:breast_female
## 7            gtex:breast.mammary.tissue_female
## 8             scipher:scipher.complete.dataset
## 9                                  tcga:kidney
## 10 scipher:scipher.sample.per.patient.baseline
## 11                                gtex:thyroid
## 12                  gtex:breast.mammary.tissue
## 13                              tcga:tcga-kirp
## 14                                   gtex:lung
## 15                            gtex:whole.blood
## 16                              tcga:tcga-luad
## 17                              tcga:tcga-thca
## 18                              tcga:tcga-kirc
## 19                              tcga:tcga-lusc
## 20                              tcga:brca.luma
## 21                       tcga:tcga-brca_female
## 22                              tcga:brca.lumb
##                                    model        a         b         L
## 1  Stretched exponential (by linear fit) 2.198803 214.47311  1.902216
## 2  Stretched exponential (by linear fit) 2.183346 177.44983  2.056570
## 3  Stretched exponential (by linear fit) 1.942484  96.46064  1.913428
## 4  Stretched exponential (by linear fit) 1.898637  79.73148  4.555803
## 5  Stretched exponential (by linear fit) 1.836919  69.54141  1.443081
## 6  Stretched exponential (by linear fit) 1.836527  71.30603  5.974945
## 7  Stretched exponential (by linear fit) 1.811947  64.89496  3.885985
## 8  Stretched exponential (by linear fit) 1.790429  42.98998  1.462534
## 9  Stretched exponential (by linear fit) 1.771018  52.59051  5.738395
## 10 Stretched exponential (by linear fit) 1.758283  41.83477  2.332343
## 11 Stretched exponential (by linear fit) 1.746114  46.00874  1.669286
## 12 Stretched exponential (by linear fit) 1.720270  43.15181  2.114016
## 13 Stretched exponential (by linear fit) 1.668704  38.50169  3.923615
## 14 Stretched exponential (by linear fit) 1.654318  28.07262  1.964261
## 15 Stretched exponential (by linear fit) 1.650687  21.33857  1.586486
## 16 Stretched exponential (by linear fit) 1.649890  37.43779  2.749527
## 17 Stretched exponential (by linear fit) 1.610579  27.46556  2.737581
## 18 Stretched exponential (by linear fit) 1.565623  23.34566  3.378756
## 19 Stretched exponential (by linear fit) 1.549679  25.93423  4.415126
## 20 Stretched exponential (by linear fit) 1.528906  21.43228  4.180236
## 21 Stretched exponential (by linear fit) 1.478753  17.41578  3.599061
## 22 Stretched exponential (by linear fit) 1.441041  14.12650 19.001680
##    max_value_in_dataset
## 1              87033411
## 2              89070336
## 3              86523419
## 4              36339651
## 5             109822205
## 6              32891554
## 7              42299605
## 8              65309202
## 9              32872637
## 10             40568917
## 11             95645379
## 12             74740207
## 13             43843097
## 14             82744314
## 15             72312862
## 16             67033504
## 17             64735459
## 18             55176637
## 19             43223551
## 20             44003689
## 21             50774187
## 22              9515666
##                                                          formula adj.r.squared
## 1     F(x) = 1.9 * exp[(214.47 * x**((2.2) + 1)) / ((2.2) + 1) ]     0.9735496
## 2  F(x) = 2.06 * exp[(177.45 * x**((2.18) + 1)) / ((2.18) + 1) ]     0.9824779
## 3   F(x) = 1.91 * exp[(96.46 * x**((1.94) + 1)) / ((1.94) + 1) ]     0.9962786
## 4     F(x) = 4.56 * exp[(79.73 * x**((1.9) + 1)) / ((1.9) + 1) ]     0.9791491
## 5   F(x) = 1.44 * exp[(69.54 * x**((1.84) + 1)) / ((1.84) + 1) ]     0.9921006
## 6   F(x) = 5.97 * exp[(71.31 * x**((1.84) + 1)) / ((1.84) + 1) ]     0.9939891
## 7   F(x) = 3.89 * exp[(64.89 * x**((1.81) + 1)) / ((1.81) + 1) ]     0.9963943
## 8   F(x) = 1.46 * exp[(42.99 * x**((1.79) + 1)) / ((1.79) + 1) ]     0.9868786
## 9   F(x) = 5.74 * exp[(52.59 * x**((1.77) + 1)) / ((1.77) + 1) ]     0.9917808
## 10  F(x) = 2.33 * exp[(41.83 * x**((1.76) + 1)) / ((1.76) + 1) ]     0.9908833
## 11  F(x) = 1.67 * exp[(46.01 * x**((1.75) + 1)) / ((1.75) + 1) ]     0.9898823
## 12  F(x) = 2.11 * exp[(43.15 * x**((1.72) + 1)) / ((1.72) + 1) ]     0.9871514
## 13   F(x) = 3.92 * exp[(38.5 * x**((1.67) + 1)) / ((1.67) + 1) ]     0.9948093
## 14  F(x) = 1.96 * exp[(28.07 * x**((1.65) + 1)) / ((1.65) + 1) ]     0.9838666
## 15  F(x) = 1.59 * exp[(21.34 * x**((1.65) + 1)) / ((1.65) + 1) ]     0.9832295
## 16  F(x) = 2.75 * exp[(37.44 * x**((1.65) + 1)) / ((1.65) + 1) ]     0.9979574
## 17  F(x) = 2.74 * exp[(27.47 * x**((1.61) + 1)) / ((1.61) + 1) ]     0.9858457
## 18  F(x) = 3.38 * exp[(23.35 * x**((1.57) + 1)) / ((1.57) + 1) ]     0.9956269
## 19  F(x) = 4.42 * exp[(25.93 * x**((1.55) + 1)) / ((1.55) + 1) ]     0.9853471
## 20  F(x) = 4.18 * exp[(21.43 * x**((1.53) + 1)) / ((1.53) + 1) ]     0.9977903
## 21   F(x) = 3.6 * exp[(17.42 * x**((1.48) + 1)) / ((1.48) + 1) ]     0.9974590
## 22    F(x) = 19 * exp[(14.13 * x**((1.44) + 1)) / ((1.44) + 1) ]     0.9857087
##    relative.error.mean  unnorm_L total_num_edges density
## 1           0.52545708 165556306       165556306       1
## 2           0.25787164 183179370       183179370       1
## 3           0.10243415 165556306       165556306       1
## 4           0.23290398 165556306       165556306       1
## 5           0.13088726 158482306       158482306       1
## 6           0.10866531 196525225       196525225       1
## 7           0.30863165 164375646       164375646       1
## 8           0.13920918  95516931        95516931       1
## 9           0.17617336 188636176       188636176       1
## 10          0.13866336  94620646        94620646       1
## 11          0.14868418 159659515       159659515       1
## 12          0.18890669 158001976       158001976       1
## 13          0.16316561 172023426       172023426       1
## 14          0.28316122 162531435       162531435       1
## 15          0.16228112 114723378       114723378       1
## 16          0.08379174 184310400       184310400       1
## 17          0.16020709 177218551       177218551       1
## 18          0.08005273 186428395       186428395       1
## 19          0.25501658 190837416       190837416       1
## 20          0.06539910 183945790       183945790       1
## 21          0.06618629 182739403       182739403       1
## 22          0.20213896 180813636       180813636       1
##              subclassification L_vs_total
## 1  disease_inflamed_vs_control          0
## 2                       tissue          0
## 3  disease_inflamed_vs_control          0
## 4  disease_inflamed_vs_control          0
## 5                                       0
## 6                       tissue          0
## 7                                       0
## 8                                       0
## 9                       tissue          0
## 10                                      0
## 11                                      0
## 12                                      0
## 13                       tumor          0
## 14                                      0
## 15                                      0
## 16                       tumor          0
## 17                       tumor          0
## 18                       tumor          0
## 19                       tumor          0
## 20                     subtype          0
## 21                       tumor          0
## 22                     subtype          0
##                                            dataset_name   dataset     sex
## 1       gse193677:rectum-disease_inflamed_vs_control_uc gse193677      uc
## 2                                      tcga:lung-tissue      tcga        
## 3  gse193677:rectum-disease_inflamed_vs_control_control gse193677 control
## 4       gse193677:rectum-disease_inflamed_vs_control_cd gse193677      cd
## 5                       gtex:skin.sun.exposed.lower.leg      gtex        
## 6                             tcga:breast-tissue_female      tcga  female
## 7                     gtex:breast.mammary.tissue_female      gtex  female
## 8                      scipher:scipher.complete.dataset   scipher        
## 9                                    tcga:kidney-tissue      tcga        
## 10          scipher:scipher.sample.per.patient.baseline   scipher        
## 11                                         gtex:thyroid      gtex        
## 12                           gtex:breast.mammary.tissue      gtex        
## 13                                 tcga:tcga-kirp-tumor      tcga        
## 14                                            gtex:lung      gtex        
## 15                                     gtex:whole.blood      gtex        
## 16                                 tcga:tcga-luad-tumor      tcga        
## 17                                 tcga:tcga-thca-tumor      tcga        
## 18                                 tcga:tcga-kirc-tumor      tcga        
## 19                                 tcga:tcga-lusc-tumor      tcga        
## 20                               tcga:brca.luma-subtype      tcga        
## 21                          tcga:tcga-brca-tumor_female      tcga  female
## 22                               tcga:brca.lumb-subtype      tcga        
##    type_correlation correlation sample_size_statistical
## 1              weak         0.2                68.77302
## 2              weak         0.2                68.77302
## 3              weak         0.2                68.77302
## 4              weak         0.2                68.77302
## 5              weak         0.2                68.77302
## 6              weak         0.2                68.77302
## 7              weak         0.2                68.77302
## 8              weak         0.2                68.77302
## 9              weak         0.2                68.77302
## 10             weak         0.2                68.77302
## 11             weak         0.2                68.77302
## 12             weak         0.2                68.77302
## 13             weak         0.2                68.77302
## 14             weak         0.2                68.77302
## 15             weak         0.2                68.77302
## 16             weak         0.2                68.77302
## 17             weak         0.2                68.77302
## 18             weak         0.2                68.77302
## 19             weak         0.2                68.77302
## 20             weak         0.2                68.77302
## 21             weak         0.2                68.77302
## 22             weak         0.2                68.77302
##    sample_size_statistical_corrected num_edges_from_statistical_corrected
## 1                           940.9234                            157679996
## 2                           945.7592                            175094031
## 3                           940.9234                            140899038
## 4                           940.9234                            137078882
## 5                           938.8359                            120953460
## 6                           949.1215                            149208667
## 7                           940.5813                            120811364
## 8                           914.6399                             74520763
## 9                           947.1626                            133474106
## 10                          914.1895                             69148495
## 11                          939.1897                            109922481
## 12                          938.6909                            102474141
## 13                          942.7552                             95305292
## 14                          940.0419                             99917128
## 15                          923.3936                             78008662
## 16                          946.0535                             94258238
## 17                          944.1776                             89214478
## 18                          946.5998                             79233906
## 19                          947.7173                             64143529
## 20                          945.9588                             62419293
## 21                          945.6442                             46512537
## 22                          945.1377                             37979699
##    num_edges_from_statistical_corrected_norm
## 1                                  0.9524252
## 2                                  0.9558611
## 3                                  0.8510642
## 4                                  0.8279895
## 5                                  0.7631985
## 6                                  0.7592342
## 7                                  0.7349712
## 8                                  0.7801838
## 9                                  0.7075743
## 10                                 0.7307971
## 11                                 0.6884806
## 12                                 0.6485624
## 13                                 0.5540251
## 14                                 0.6147557
## 15                                 0.6799718
## 16                                 0.5114103
## 17                                 0.5034150
## 18                                 0.4250099
## 19                                 0.3361161
## 20                                 0.3393353
## 21                                 0.2545293
## 22                                 0.2100489
a_vs_fraction_corr_file = paste(tables_dir, 
                                "a_vs_fraction_sig_correlations_pearson_pval_0.05.txt", 
                                sep = "/")

sample_size_vs_a_df %>% 
  arrange(desc(a)) %>% 
  fwrite(a_vs_fraction_corr_file)

Plot alpha vs. fraction of correlations above 0.2:

# Process data
datasets_selected <- c("scipher:scipher.sample.per.patient.baseline",
                       "gtex:breast.mammary.tissue_female",
                       "gtex:lung",
                       "gtex:skin.sun.exposed.lower.leg",
                       "gtex:thyroid",
                       "gtex:whole.blood",
                       "tcga:tcga-brca-tumor_female",
                       "tcga:tcga-kirc-tumor",
                       "tcga:tcga-kirp-tumor",
                       "tcga:tcga-luad-tumor",
                       "tcga:tcga-lusc-tumor",
                       "tcga:tcga-thca-tumor",
                       "tcga:breast-tissue_female", 
                       "tcga:kidney-tissue",
                       "tcga:lung-tissue",
                       "tcga:brca.luma-subtype",
                       "tcga:brca.lumb-subtype")

sample_size_vs_a_plot_df = sample_size_vs_a_df %>% 
  filter(type_correlation == type_correlation_selected) %>% 
  filter(dataset_name %in% datasets_selected) %>%
  mutate(dataset_name = 
           replace(dataset_name, 
                   dataset_name == "scipher:scipher.sample.per.patient.baseline", 
                   "R. arthritis")) %>%
  mutate(dataset_name = 
           replace(dataset_name, 
                   dataset_name == "gtex:whole.blood",
                   "GTEx: Whole blood")) %>%
  #mutate(dataset_name = replace(dataset_name, dataset_name == "gtex:thyroid", "GTEx: Thyroid")) %>%
  mutate(dataset_name = 
           replace(dataset_name, 
                   dataset_name == "gtex:lung", 
                   "GTEx: Lung")) %>%
  #mutate(dataset_name = replace(dataset_name, dataset_name == "gtex:artery.tibial", "GTEx: Artery tibial")) %>%
  mutate(dataset_name = 
           replace(dataset_name, 
                   dataset_name == "gtex:breast.mammary.tissue_female", 
                   "GTEx: Breast")) %>%
  #mutate(dataset_name = replace(dataset_name, dataset_name == "tcga:tcga-coad", "TCGA: Colon cancer")) %>%
  #mutate(dataset_name = replace(dataset_name, dataset_name == "tcga:tcga-thca", "TCGA: Thyroid cancer")) %>%
  mutate(dataset_name = 
           replace(dataset_name, 
                   dataset_name == "tcga:tcga-luad-tumor", 
                   "TCGA: Lung cancer")) %>%
  mutate(dataset_name = 
           replace(dataset_name, 
                   dataset_name == "tcga:tcga-brca-tumor_female", 
                   "TCGA: Breast cancer")) %>%
  left_join( (name2color %>% 
                as.data.frame() %>% 
                dplyr::rename("rgb_col" = ".") %>% 
                tibble::rownames_to_column("dataset_name")), 
             by = c("dataset_name") ) %>%
  mutate(rgb_col = replace(rgb_col, is.na(rgb_col), "#000000")) %>% # Non-selected datasets with color black
  mutate(geom_text_repel_label=dplyr::if_else(dataset_name == "R. arthritis", "R. arthritis", 
                                              dplyr::if_else(dataset_name == "GTEx: Whole blood", "GTEx: Whole blood", 
                                                             dplyr::if_else(dataset_name == "GTEx: Lung", "GTEx: Lung", 
                                                                            dplyr::if_else(dataset_name == "GTEx: Breast", "GTEx: Breast", 
                                                                                           dplyr::if_else(dataset_name == "TCGA: Lung cancer", "TCGA: Lung cancer", 
                                                                                                          dplyr::if_else(dataset_name == "TCGA: Breast cancer", "TCGA: Breast cancer", "")))))))

# Calculate correlation
ss_vs_a_cor <- cor(sample_size_vs_a_plot_df$a, sample_size_vs_a_plot_df$num_edges_from_statistical_corrected_norm)

# Calculate regression
lm_summary <- summary(lm(sample_size_vs_a_plot_df$num_edges_from_statistical_corrected_norm ~ sample_size_vs_a_plot_df$a))
slope <- coef(lm_summary)[2]
intercept <- coef(lm_summary)[1]
adj.r.squared <- lm_summary$adj.r.squared
lm_cor_result <- paste("R² =", 
                       round(adj.r.squared, 3), 
                       "\ncorr. =", 
                       round(ss_vs_a_cor, 3))


# Make scatter plot
a_vs_fraction_sig_correlations_plot <- sample_size_vs_a_plot_df %>%
    ggplot(aes(x=a, y=num_edges_from_statistical_corrected_norm)) + 
    geom_point(aes(col=rgb_col), show.legend = FALSE) +
    scale_colour_identity() +
    geom_label_repel(aes(col = rgb_col, label = geom_text_repel_label),
                     fill="white",
                     box.padding = 0.5, max.overlaps = Inf,
                     label.size = 0.75, 
                     size = 5,
                     alpha = 0.6, 
                     label.padding=0.25, 
                     fontface = "bold",
                     na.rm = TRUE,
                     seed = 5555) +
    new_scale_color() +
    geom_abline(aes(slope = slope,
                    intercept = intercept,
                    col = lm_cor_result),
              linetype = "dashed", 
              show.legend = FALSE) +
  scale_color_manual(values = c("gray50")) +
  annotate("text", x = 2.07, y = 0.77, label = lm_cor_result, size = 5) +
  xlab(expression(alpha)) +
  ylab("Frac. correlations above 0.2") +
  theme_linedraw() +
  theme(aspect.ratio = 1, 
        plot.title = element_text(size = 20, face = "bold"), 
        axis.title = element_text(size = 17, face = "bold"), 
        axis.text = element_text(size = 16), 
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        text = element_text(family = "Helvetica"),
        legend.position="none")
        #legend.position = c(0.70, 0.25),
        #legend.text = element_text(size = 14), 
        #legend.title=element_blank())

print(a_vs_fraction_sig_correlations_plot)

plot_file <- paste(plots_dir,
                   "a_vs_fraction_sig_correlations_pearson_pval_0.05.png",
                   sep="/")
ggsave(
  plot_file,
  plot=a_vs_fraction_sig_correlations_plot,
  dpi = 1200,
  #width = 6400,
  #height = 6000,
  units = c("px")
)
## Saving 8400 x 6000 px image

Less datasets:

# Process data
datasets_selected <- c("scipher:scipher.sample.per.patient.baseline",
                       "gtex:breast.mammary.tissue_female",
                       "gtex:lung",
                       "gtex:skin.sun.exposed.lower.leg",
                       "gtex:thyroid",
                       "gtex:whole.blood",
                       "tcga:tcga-brca-tumor_female",
                       "tcga:tcga-kirc-tumor",
                       "tcga:tcga-kirp-tumor",
                       "tcga:tcga-luad-tumor",
                       "tcga:tcga-lusc-tumor",
                       "tcga:tcga-thca-tumor",
                       "tcga:breast-tissue_female", 
                       "tcga:kidney-tissue",
                       "tcga:lung-tissue",
                       "tcga:brca.luma-subtype",
                       "tcga:brca.lumb-subtype")

sample_size_vs_a_plot_lessdatasets_df = sample_size_vs_a_df %>% 
  filter(type_correlation == type_correlation_selected) %>% 
  filter(dataset_name %in% datasets_selected) %>%
  mutate(dataset = replace(dataset, dataset == "scipher", "R. arthritis"),
         dataset = replace(dataset, dataset == "gtex", "GTEx"),
         dataset = replace(dataset, dataset == "tcga", "TCGA")) %>%
  mutate(dataset_name = 
           replace(dataset_name, 
                   dataset_name == "gtex:breast.mammary.tissue_female", 
                   "GTEx: Breast")) %>%
  mutate(dataset_name = 
           replace(dataset_name, 
                   dataset_name == "tcga:tcga-brca-tumor_female", 
                   "TCGA: Breast cancer")) %>%
  left_join( (name2color %>% 
                as.data.frame() %>% 
                dplyr::rename("rgb_col" = ".") %>% 
                tibble::rownames_to_column("dataset")# %>% 
                #filter(dataset_name %in% c("TCGA: Breast cancer", "GTEx: Breast"))
              ), 
             by = c("dataset") ) %>%
  mutate(rgb_col = replace(rgb_col, is.na(rgb_col), "#000000")) %>% # Non-selected datasets with color black
  mutate(geom_text_repel_label=dplyr::if_else(dataset_name == "R. arthritis", "R. arthritis", 
                                              dplyr::if_else(dataset_name == "GTEx: Whole blood", "GTEx: Whole blood", 
                                                             dplyr::if_else(dataset_name == "GTEx: Lung", "GTEx: Lung", 
                                                                            dplyr::if_else(dataset_name == "GTEx: Breast", "GTEx: Breast", 
                                                                                           dplyr::if_else(dataset_name == "TCGA: Lung cancer", "TCGA: Lung cancer", 
                                                                                                          dplyr::if_else(dataset_name == "TCGA: Breast cancer", "TCGA: Breast cancer", "")))))))

# Calculate correlation
a_vs_fr_cor <- cor(sample_size_vs_a_plot_lessdatasets_df$a, sample_size_vs_a_plot_lessdatasets_df$num_edges_from_statistical_corrected_norm)

# Calculate regression
lm_summary_avsfr <- summary(lm(sample_size_vs_a_plot_lessdatasets_df$num_edges_from_statistical_corrected_norm ~ sample_size_vs_a_plot_lessdatasets_df$a))
slope_avsfr <- coef(lm_summary_avsfr)[2]
intercept_avsfr <- coef(lm_summary_avsfr)[1]
adj.r.squared_avsfr <- lm_summary_avsfr$adj.r.squared
lm_cor_result_avsfr <- paste("R² =", 
                       round(adj.r.squared_avsfr, 3), 
                       "\ncorr. =", 
                       round(a_vs_fr_cor, 3))


# Make scatter plot
a_vs_fraction_sig_correlations_lessdatasets_plot <- sample_size_vs_a_plot_lessdatasets_df %>%
    ggplot(aes(x=a, y=num_edges_from_statistical_corrected_norm)) + 
    geom_point(aes(col=rgb_col), alpha = 0.6, size = 3, show.legend = TRUE) +
    #scale_colour_identity() +
    scale_colour_identity(labels = unique(sample_size_vs_a_plot_lessdatasets_df$dataset), breaks = unique(sample_size_vs_a_plot_lessdatasets_df$rgb_col), guide = "legend") +
    geom_label_repel(aes(col = rgb_col, label = geom_text_repel_label),
                     fill="white",
                     box.padding = 0.5, max.overlaps = Inf,
                     label.size = 0.75, 
                     size = 5,
                     alpha = 0.6, 
                     label.padding = 0.25, 
                     fontface = "bold",
                     na.rm = TRUE,
                     show.legend = FALSE,
                     max.iter = 1e5,
                     seed = 5555) +
    geom_abline(aes(slope = slope_avsfr,
                    intercept = intercept_avsfr),
                linetype = "dashed", 
                col = "gray50",
                show.legend = FALSE) +
  annotate("text", x = 2.00, y = 0.45, label = lm_cor_result_avsfr, size = 5) +
  xlab(expression(alpha)) +
  ylab("Frac. correlations above 0.2") +
  theme_linedraw() +
  theme(aspect.ratio = 1, 
        plot.title = element_text(size = 20, face = "bold"), 
        axis.title = element_text(size = 17, face = "bold"), 
        axis.text = element_text(size = 16), 
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        text = element_text(family = "Helvetica"),
        #legend.position="none")
        legend.position = c(0.2, 0.9),
        legend.background = element_rect(fill = 'transparent', color = "transparent"),
        legend.text = element_text(size = 16), 
        legend.title=element_blank())

print(a_vs_fraction_sig_correlations_lessdatasets_plot)

plot_file <- paste(plots_dir,
                   "a_vs_fraction_sig_correlations_pearson_pval_0.05_lessdatasets.png",
                   sep="/")
ggsave(
  plot_file,
  plot=a_vs_fraction_sig_correlations_lessdatasets_plot,
  dpi = 1200,
  #width = 6400,
  #height = 6000,
  units = c("px")
)
## Saving 8400 x 6000 px image

Variation plot

Create plots showing the variation of each gene across samples:

datasets_selected = c("gtex:breast.mammary.tissue_female", 
                      "gtex:lung", 
                      "gtex:whole.blood", 
                      "tcga:tcga-brca-tumor_female", 
                      "tcga:tcga-luad-tumor", 
                      "scipher:scipher.sample.per.patient.baseline")
parameters = c("sd", "mean", "cv")
parameter2label <- list("sd"="SD", "mean"="mean", "cv"="CV")
type_counts_selected = "reads"
distribution_plots = list()
sd_genes_filtered = sd_genes_df %>%
  filter(dataset_name %in% datasets_selected) %>%
  filter(type_counts == type_counts_selected) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "scipher.sample.per.patient.baseline", "R. arthritis")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "Whole.Blood", "GTEx: Whole blood")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "Lung", "GTEx: Lung")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "Breast.Mammary.Tissue", "GTEx: Breast")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "TCGA-LUAD", "TCGA: Lung cancer")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "TCGA-BRCA", "TCGA: Breast cancer")) %>%
  inner_join( (name2color %>% as.data.frame() %>% dplyr::rename("rgb_col"=".") %>% tibble::rownames_to_column("type_dataset")), by=c("type_dataset") )

print(unique(sd_genes_filtered$dataset_name)) # test to check if datasets are selected correctly
## [1] "gtex:breast.mammary.tissue_female"          
## [2] "gtex:lung"                                  
## [3] "gtex:whole.blood"                           
## [4] "tcga:tcga-brca-tumor_female"                
## [5] "tcga:tcga-luad-tumor"                       
## [6] "scipher:scipher.sample.per.patient.baseline"
for (x in 1:length(parameters)){
  parameter = parameters[x]
  
  # # Plot with legend
  # distribution_plot = sd_genes_filtered %>%
  #   ggplot(aes(.data[[parameter]], color=type_dataset)) +
  #   geom_freqpoly(size=1.5) +
  #   scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#65F3BF", "#0072B2", "#56B4E9")) +
  #   scale_x_log10() + 
  #   scale_y_log10() + 
  #   labs(x=paste("Gene expression ", parameter2label[[parameter]], " (Log)", sep=""), y = "Number of genes (Log)") +
  #   theme_linedraw() + 
  #   guides(col=guide_legend(title="Dataset")) +
  #   theme(aspect.ratio=1, plot.title =  element_text(size = 20, face="bold"), axis.title = element_text(size = 17, face="bold"), axis.text = element_text(size = 16), legend.text = element_text(size = 16), legend.title=element_text(size=16, face="bold"))

  # Plot without legend
  distribution_plot = sd_genes_filtered %>%
    ggplot(aes(.data[[parameter]], color=rgb_col)) +
    geom_freqpoly(size=1.5) +
    scale_colour_identity() +
    scale_x_log10() + 
    scale_y_log10() + 
    labs(x=paste("Gene expression ", parameter2label[[parameter]], " (Log)", sep=""), y = "Number of genes (Log)") +
    theme_linedraw() + 
    guides(col=guide_legend(title="Dataset")) +
    theme(aspect.ratio=1, 
          plot.title =  element_text(size = 20, face="bold"), 
          axis.title = element_text(size = 17, face="bold"), 
          axis.text = element_text(size = 16), 
          panel.grid.major=element_blank(), 
          panel.grid.minor = element_blank(),
          text = element_text(family = "Helvetica"),
          legend.position="none")
  
  # Save distribution plot without changes in the theme, because we will tune it afterwards with patchwork
  distribution_plots[[x]] = distribution_plot
  
  plot_file = paste(plots_dir, "/distribution_", parameter, "_genes.png", sep="")
  ggsave(
    plot_file,
    plot=distribution_plot,
    dpi = 1200,
    #width = 10000,
    #height = 6000,
    units = c("px")
  )
  print(distribution_plot)
}
## Saving 8400 x 6000 px image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Saving 8400 x 6000 px image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Saving 8400 x 6000 px image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis

Less datasets:

datasets_selected = c("gtex:breast.mammary.tissue_female", 
                      "tcga:tcga-brca-tumor_female")
parameters = c("sd", "mean", "cv")
parameter2label <- list("sd"="SD", "mean"="mean", "cv"="CV")
type_counts_selected = "reads"
distribution_plots_lessdatasets = list()
sd_genes_filtered = sd_genes_df %>%
  filter(dataset_name %in% datasets_selected) %>%
  filter(type_counts == type_counts_selected) %>%
  mutate(type_dataset = replace(type_dataset, 
                                type_dataset == "Breast.Mammary.Tissue", 
                                "GTEx: Breast")) %>%
  mutate(type_dataset = replace(type_dataset, 
                                type_dataset == "TCGA-BRCA", 
                                "TCGA: Breast cancer")) %>%
  inner_join( (name2color %>%
                 as.data.frame() %>%
                 dplyr::rename("rgb_col"=".") %>%
                 tibble::rownames_to_column("type_dataset")), 
              by=c("type_dataset") )

print(unique(sd_genes_filtered$dataset_name)) # test to check if datasets are selected correctly
## [1] "gtex:breast.mammary.tissue_female" "tcga:tcga-brca-tumor_female"
for (x in 1:length(parameters)){
  parameter = parameters[x]

  # Plot without legend
  distribution_plot_lessdatasets = sd_genes_filtered %>%
    ggplot(aes(.data[[parameter]], color=rgb_col)) +
    geom_freqpoly(size=1.5) +
    scale_colour_identity() +
    scale_x_log10() + 
    scale_y_log10() + 
    labs(x=paste("Gene expression ", 
                 parameter2label[[parameter]], 
                 " (Log)", sep=""), 
         y = "Number of genes (Log)") +
    theme_linedraw() + 
    guides(col=guide_legend(title="Dataset")) +
    theme(aspect.ratio=1, 
          plot.title =  element_text(size = 20, face="bold"), 
          axis.title = element_text(size = 17, face="bold"), 
          axis.text = element_text(size = 16), 
          panel.grid.major=element_blank(), 
          panel.grid.minor = element_blank(),
          text = element_text(family = "Helvetica"),
          legend.position="none")
  
  # Save distribution plot without changes in the theme, because we will tune it 
  # afterwards with patchwork
  distribution_plots_lessdatasets[[x]] = distribution_plot_lessdatasets
  
  plot_file = paste(plots_dir, 
                    "/distribution_", 
                    parameter, 
                    "_genes_lessdatasets.png", 
                    sep="")
  ggsave(
    plot_file,
    plot=distribution_plot_lessdatasets,
    dpi = 1200,
    #width = 10000,
    #height = 6000,
    units = c("px")
  )
  print(distribution_plot_lessdatasets)
}
## Saving 8400 x 6000 px image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Saving 8400 x 6000 px image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Saving 8400 x 6000 px image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis

Show numbers:

a_vs_fraction_corr_file = 
  paste(tables_dir, 
        "a_vs_fraction_sig_correlations_pearson_pval_0.05.txt", 
        sep='/')
a_vs_fraction_corr_df = fread(a_vs_fraction_corr_file)
head(a_vs_fraction_corr_df)
##                    type_dataset                                 model        a
## 1: gse193677:rectum_uc_inflamed Stretched exponential (by linear fit) 2.198803
## 2: gse193677:rectum_uc_inflamed Stretched exponential (by linear fit) 2.198803
## 3: gse193677:rectum_uc_inflamed Stretched exponential (by linear fit) 2.198803
## 4: gse193677:rectum_uc_inflamed Stretched exponential (by linear fit) 2.198803
## 5:                    tcga:lung Stretched exponential (by linear fit) 2.183346
## 6:                    tcga:lung Stretched exponential (by linear fit) 2.183346
##           b        L max_value_in_dataset
## 1: 214.4731 1.902216             87033411
## 2: 214.4731 1.902216             87033411
## 3: 214.4731 1.902216             87033411
## 4: 214.4731 1.902216             87033411
## 5: 177.4498 2.056570             89070336
## 6: 177.4498 2.056570             89070336
##                                                          formula adj.r.squared
## 1:    F(x) = 1.9 * exp[(214.47 * x**((2.2) + 1)) / ((2.2) + 1) ]     0.9735496
## 2:    F(x) = 1.9 * exp[(214.47 * x**((2.2) + 1)) / ((2.2) + 1) ]     0.9735496
## 3:    F(x) = 1.9 * exp[(214.47 * x**((2.2) + 1)) / ((2.2) + 1) ]     0.9735496
## 4:    F(x) = 1.9 * exp[(214.47 * x**((2.2) + 1)) / ((2.2) + 1) ]     0.9735496
## 5: F(x) = 2.06 * exp[(177.45 * x**((2.18) + 1)) / ((2.18) + 1) ]     0.9824779
## 6: F(x) = 2.06 * exp[(177.45 * x**((2.18) + 1)) / ((2.18) + 1) ]     0.9824779
##    relative.error.mean  unnorm_L total_num_edges density
## 1:           0.5254571 165556306       165556306       1
## 2:           0.5254571 165556306       165556306       1
## 3:           0.5254571 165556306       165556306       1
## 4:           0.5254571 165556306       165556306       1
## 5:           0.2578716 183179370       183179370       1
## 6:           0.2578716 183179370       183179370       1
##              subclassification L_vs_total
## 1: disease_inflamed_vs_control          0
## 2: disease_inflamed_vs_control          0
## 3: disease_inflamed_vs_control          0
## 4: disease_inflamed_vs_control          0
## 5:                      tissue          0
## 6:                      tissue          0
##                                       dataset_name   dataset sex
## 1: gse193677:rectum-disease_inflamed_vs_control_uc gse193677  uc
## 2: gse193677:rectum-disease_inflamed_vs_control_uc gse193677  uc
## 3: gse193677:rectum-disease_inflamed_vs_control_uc gse193677  uc
## 4: gse193677:rectum-disease_inflamed_vs_control_uc gse193677  uc
## 5:                                tcga:lung-tissue      tcga    
## 6:                                tcga:lung-tissue      tcga    
##    type_correlation correlation sample_size_statistical
## 1:             weak         0.2               68.773025
## 2:         moderate         0.4               18.002295
## 3:           strong         0.6                8.523611
## 4:      very strong         0.8                5.063346
## 5:             weak         0.2               68.773025
## 6:         moderate         0.4               18.002295
##    sample_size_statistical_corrected num_edges_from_statistical_corrected
## 1:                         940.92339                            157679996
## 2:                         222.20909                            125753013
## 3:                          88.31798                             72105576
## 4:                          39.95108                             19259015
## 5:                         945.75920                            175094031
## 6:                         223.34132                            142792927
##    num_edges_from_statistical_corrected_norm
## 1:                                 0.9524252
## 2:                                 0.7595785
## 3:                                 0.4355351
## 4:                                 0.1163291
## 5:                                 0.9558611
## 6:                                 0.7795252
sd_cut <- 10
mean_cut <- 10
cv_cut <- 50

sd_genes_df %>% 
  filter(type_counts == type_counts_selected) %>% 
  group_by(dataset_name, subclassification) %>% 
  summarize(mean_sd = mean(sd), 
            median_sd = median(sd), 
            mean_mean = mean(mean), 
            median_mean = median(mean), 
            mean_cv = mean(cv), 
            median_cv = median(cv)) %>%
  head(10)
## `summarise()` has grouped output by 'dataset_name'. You can override using the
## `.groups` argument.
## # A tibble: 10 × 8
## # Groups:   dataset_name [10]
##    dataset_name          subcl…¹ mean_sd media…² mean_…³ media…⁴ mean_cv media…⁵
##    <chr>                 <chr>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 gtex:breast.mammary.… ""        1852.    515.   3383.   1015.    78.3    53.2
##  2 gtex:breast.mammary.… ""        1711.    465.   3366.   1022.    68.6    48.7
##  3 gtex:lung             ""        1815.    537.   3269.    974.    75.4    54.9
##  4 gtex:skin.sun.expose… ""        1638.    442.   3405.    908.    71.2    48.2
##  5 gtex:thyroid          ""        1546.    476.   3175.   1033.    68.1    45.4
##  6 gtex:whole.blood      ""        2400.    335.   3268.    403.   105.     78.8
##  7 scipher:scipher.comp… ""         429.    111.    972.    269.    49.1    42.6
##  8 scipher:scipher.comp… ""         448.    116.   1007.    280.    48.6    42.8
##  9 scipher:scipher.comp… ""         437.    111.    990.    277.    48.0    41.2
## 10 scipher:scipher.samp… ""         394.    104.    953.    261.    46.5    40.7
## # … with abbreviated variable names ¹​subclassification, ²​median_sd, ³​mean_mean,
## #   ⁴​median_mean, ⁵​median_cv
# Calculate number of genes above certain SD, mean and CV cut-offs
sd_genes_vs_a_df = sd_genes_df %>% 
  filter(type_counts == type_counts_selected) %>% 
  group_by(dataset, 
           type_dataset, 
           dataset_name, 
           type_counts, 
           sex, 
           subclassification) %>% 
  summarize(num_genes_above_sd_cut = length(sd[sd >= sd_cut]), 
            num_genes_above_mean_cut = length(mean[mean >= mean_cut]), 
            num_genes_above_cv_cut = length(cv[cv >= cv_cut]), 
            num_genes = n()) %>% 
  ungroup() %>% 
  mutate(fraction_genes_above_sd_cut = num_genes_above_sd_cut / num_genes, 
         fraction_genes_above_mean_cut = num_genes_above_mean_cut / num_genes, 
         fraction_genes_above_cv_cut = num_genes_above_cv_cut / num_genes) %>% 
  inner_join((a_vs_fraction_corr_df %>% select(-type_dataset)), 
             by = c("dataset_name", "dataset", "sex", "subclassification")) %>% 
  select(!c("type_correlation", 
            "correlation", 
            "sample_size_statistical", 
            "sample_size_statistical_corrected", 
            "num_edges_from_statistical_corrected", 
            "num_edges_from_statistical_corrected_norm")) %>% 
  unique() %>%
  as.data.frame()
## `summarise()` has grouped output by 'dataset', 'type_dataset', 'dataset_name',
## 'type_counts', 'sex'. You can override using the `.groups` argument.
head(sd_genes_vs_a_df)
##   dataset               type_dataset                      dataset_name
## 1    gtex      Breast.Mammary.Tissue        gtex:breast.mammary.tissue
## 2    gtex      Breast.Mammary.Tissue gtex:breast.mammary.tissue_female
## 3    gtex                       Lung                         gtex:lung
## 4    gtex Skin.Sun.Exposed.Lower.leg   gtex:skin.sun.exposed.lower.leg
## 5    gtex                    Thyroid                      gtex:thyroid
## 6    gtex                Whole.Blood                  gtex:whole.blood
##   type_counts    sex subclassification num_genes_above_sd_cut
## 1       reads                                           17266
## 2       reads female                                    17455
## 3       reads                                           17469
## 4       reads                                           17083
## 5       reads                                           17197
## 6       reads                                           14927
##   num_genes_above_mean_cut num_genes_above_cv_cut num_genes
## 1                    17407                   9530     17777
## 2                    17684                   8796     18132
## 3                    17623                  10459     18030
## 4                    17279                   8427     17804
## 5                    17411                   7840     17870
## 6                    14868                  14607     15148
##   fraction_genes_above_sd_cut fraction_genes_above_mean_cut
## 1                   0.9712550                     0.9791866
## 2                   0.9626627                     0.9752923
## 3                   0.9688852                     0.9774265
## 4                   0.9595035                     0.9705122
## 5                   0.9623391                     0.9743145
## 6                   0.9854106                     0.9815157
##   fraction_genes_above_cv_cut                                 model        a
## 1                   0.5360860 Stretched exponential (by linear fit) 1.720270
## 2                   0.4851092 Stretched exponential (by linear fit) 1.811947
## 3                   0.5800887 Stretched exponential (by linear fit) 1.654318
## 4                   0.4733206 Stretched exponential (by linear fit) 1.836919
## 5                   0.4387241 Stretched exponential (by linear fit) 1.746114
## 6                   0.9642857 Stretched exponential (by linear fit) 1.650687
##          b        L max_value_in_dataset
## 1 43.15181 2.114016             74740207
## 2 64.89496 3.885985             42299605
## 3 28.07262 1.964261             82744314
## 4 69.54141 1.443081            109822205
## 5 46.00874 1.669286             95645379
## 6 21.33857 1.586486             72312862
##                                                        formula adj.r.squared
## 1 F(x) = 2.11 * exp[(43.15 * x**((1.72) + 1)) / ((1.72) + 1) ]     0.9871514
## 2 F(x) = 3.89 * exp[(64.89 * x**((1.81) + 1)) / ((1.81) + 1) ]     0.9963943
## 3 F(x) = 1.96 * exp[(28.07 * x**((1.65) + 1)) / ((1.65) + 1) ]     0.9838666
## 4 F(x) = 1.44 * exp[(69.54 * x**((1.84) + 1)) / ((1.84) + 1) ]     0.9921006
## 5 F(x) = 1.67 * exp[(46.01 * x**((1.75) + 1)) / ((1.75) + 1) ]     0.9898823
## 6 F(x) = 1.59 * exp[(21.34 * x**((1.65) + 1)) / ((1.65) + 1) ]     0.9832295
##   relative.error.mean  unnorm_L total_num_edges density L_vs_total
## 1           0.1889067 158001976       158001976       1          0
## 2           0.3086317 164375646       164375646       1          0
## 3           0.2831612 162531435       162531435       1          0
## 4           0.1308873 158482306       158482306       1          0
## 5           0.1486842 159659515       159659515       1          0
## 6           0.1622811 114723378       114723378       1          0

Plot different parameters (sd, mean, cv) vs. alpha:

parameters = c("sd", "mean", "cv")
parameter2label <- list("sd"="SD", "mean"="mean", "cv"="CV")
datasets_selected <- c("scipher:scipher.sample.per.patient.baseline",
                       "gtex:breast.mammary.tissue_female",
                       "gtex:lung",
                       "gtex:skin.sun.exposed.lower.leg",
                       "gtex:thyroid",
                       "gtex:whole.blood",
                       "tcga:tcga-brca-tumor_female",
                       "tcga:tcga-kirc-tumor",
                       "tcga:tcga-kirp-tumor",
                       "tcga:tcga-luad-tumor",
                       "tcga:tcga-lusc-tumor",
                       "tcga:tcga-thca-tumor",
                       "tcga:breast-tissue_female", 
                       "tcga:kidney-tissue",
                       "tcga:lung-tissue",
                       "tcga:brca.luma-subtype",
                       "tcga:brca.lumb-subtype")

# Prepare table for plots
sd_genes_vs_a_filtered = sd_genes_vs_a_df %>%
  filter(type_counts == type_counts_selected) %>%
  filter(dataset_name %in% datasets_selected) %>%
  mutate(dataset_name = 
           replace(dataset_name,
                   dataset_name == "scipher:scipher.sample.per.patient.baseline", 
                   "R. arthritis")) %>%
  mutate(dataset_name = 
           replace(dataset_name, 
                   dataset_name == "gtex:whole.blood", 
                   "GTEx: Whole blood")) %>%
  mutate(dataset_name = 
           replace(dataset_name, 
                   dataset_name == "gtex:lung", 
                   "GTEx: Lung")) %>%
  mutate(dataset_name = 
           replace(dataset_name, 
                   dataset_name == "gtex:breast.mammary.tissue_female", 
                   "GTEx: Breast")) %>%
  mutate(dataset_name = 
           replace(dataset_name, 
                   dataset_name == "tcga:tcga-luad-tumor", 
                   "TCGA: Lung cancer")) %>%
  mutate(dataset_name = 
           replace(dataset_name, 
                   dataset_name == "tcga:tcga-brca-tumor_female", 
                   "TCGA: Breast cancer")) %>%
  left_join( (name2color %>% 
                as.data.frame() %>% 
                dplyr::rename("rgb_col" = ".") %>% 
                tibble::rownames_to_column("dataset_name")), 
             by = c("dataset_name") ) %>%
  mutate(rgb_col = 
           replace(rgb_col, 
                   is.na(rgb_col), 
                   "#000000")) %>% # Non-selected datasets with color black
  mutate(geom_text_repel_label = 
           dplyr::if_else(dataset_name == "R. arthritis", 
                          "R. arthritis", 
                          dplyr::if_else(dataset_name == "GTEx: Whole blood", 
                                         "GTEx: Whole blood", 
                                         dplyr::if_else(dataset_name == "GTEx: Lung", 
                                                        "GTEx: Lung", dplyr::if_else(dataset_name == "GTEx: Breast", 
                                                                                     "GTEx: Breast", 
                                                                                     dplyr::if_else(dataset_name == "TCGA: Lung cancer",
                                                                                                    "TCGA: Lung cancer", 
                                                                                                    dplyr::if_else(dataset_name == "TCGA: Breast cancer", "TCGA: Breast cancer", ""))))))) %>% 
  as.data.frame()
scatter_plots = list()

print(unique(sd_genes_vs_a_filtered$dataset_name)) # test to check if datasets are selected correctly
##  [1] "GTEx: Breast"                    "GTEx: Lung"                     
##  [3] "gtex:skin.sun.exposed.lower.leg" "gtex:thyroid"                   
##  [5] "GTEx: Whole blood"               "R. arthritis"                   
##  [7] "tcga:brca.luma-subtype"          "tcga:brca.lumb-subtype"         
##  [9] "tcga:breast-tissue_female"       "tcga:kidney-tissue"             
## [11] "tcga:lung-tissue"                "TCGA: Breast cancer"            
## [13] "tcga:tcga-kirc-tumor"            "tcga:tcga-kirp-tumor"           
## [15] "TCGA: Lung cancer"               "tcga:tcga-lusc-tumor"           
## [17] "tcga:tcga-thca-tumor"
# Make plot for each type of parameter (SD, mean, CV)
for (x in 1:length(parameters)){
  parameter = parameters[x]
  if (parameter == "cv"){
    cut <- cv_cut
    x_legend <- 1.55
    y_legend <- 0.45
  } else if (parameter == "sd"){
    cut <- sd_cut
    x_legend <- 1.585
    y_legend <- 0.938
  } else if (parameter == "mean"){
    cut <- mean_cut
    x_legend <- 1.585
    y_legend <- 0.968
  }

  # Calculate correlation
  ss_vs_a_cor <- cor(sd_genes_vs_a_filtered$a, 
                     sd_genes_vs_a_filtered[[paste("fraction_genes_above_",
                                                   parameter,
                                                   "_cut",
                                                   sep = "")]])
  
  # Calculate regression
  lm_summary <- summary(lm(sd_genes_vs_a_filtered[[paste("fraction_genes_above_",
                                                   parameter,
                                                   "_cut",
                                                   sep = "")]] ~ 
                             sd_genes_vs_a_filtered$a))
  slope <- coef(lm_summary)[2]
  intercept <- coef(lm_summary)[1]
  adj.r.squared <- lm_summary$adj.r.squared
  lm_cor_result <- paste("R² =", 
                         round(adj.r.squared, 3), 
                         "\ncorr. =", 
                         round(ss_vs_a_cor, 3))

  param_vs_a_plot = sd_genes_vs_a_filtered %>%
    ggplot(aes(x = a,
               y = .data[[paste("fraction_genes_above_", 
                                parameter, 
                                "_cut", 
                                sep = "")]])) + 
    geom_point(aes(col=rgb_col), show.legend = FALSE) +
    scale_colour_identity() +
    geom_label_repel(aes(col = rgb_col, 
                         label = geom_text_repel_label),
                     fill = "white",
                     box.padding = 0.5, 
                     max.overlaps = Inf,
                     label.size = 0.75, 
                     size = 5,
                     alpha = 0.6, 
                     label.padding=0.25, 
                     fontface = "bold",
                     na.rm = TRUE,
                     seed = 1234) +
    new_scale_color() +
    geom_abline(aes(slope = slope,
                    intercept = intercept,
                    col = lm_cor_result),
              linetype = "dashed", 
              show.legend = FALSE) +
    scale_color_manual(values = c("gray50")) +
    annotate("text", x = x_legend, y = y_legend, label = lm_cor_result, size = 5) +
    xlab(expression(alpha)) +
    ylab(paste("Frac. genes with ", 
               parameter2label[[parameter]], 
               " above ", 
               cut, 
               sep = "")) +
    theme_linedraw() +
    theme(aspect.ratio = 1, 
          plot.title =  element_text(size = 20, face="bold"), 
          axis.title = element_text(size = 17, face="bold"), 
          axis.text = element_text(size = 16), 
          panel.grid.major=element_blank(), 
          panel.grid.minor = element_blank(),
          text = element_text(family = "Helvetica"),
          legend.position="none")
          #legend.position = c(0.70, 0.25),
          #legend.text = element_text(size = 14), 
          #legend.title=element_blank())

  plot_file = paste(plots_dir, "/a_vs_fraction_genes_above_", parameter, "_cut.png", sep="")
  ggsave(
    plot_file,
    dpi = 1200,
    width = 6000,
    #height = 6000,
    units = c("px")
  )
  print(param_vs_a_plot)
  scatter_plots[[x]] = param_vs_a_plot
}
## Saving 6000 x 6000 px image
## Saving 6000 x 6000 px image

## Saving 6000 x 6000 px image

With less datasets:

parameters = c("sd", "mean", "cv")
parameter2label <- list("sd"="SD", "mean"="mean", "cv"="CV")
datasets_selected <- c("scipher:scipher.sample.per.patient.baseline",
                       "gtex:breast.mammary.tissue_female",
                       "gtex:lung",
                       "gtex:skin.sun.exposed.lower.leg",
                       "gtex:thyroid",
                       "gtex:whole.blood",
                       "tcga:tcga-brca-tumor_female",
                       "tcga:tcga-kirc-tumor",
                       "tcga:tcga-kirp-tumor",
                       "tcga:tcga-luad-tumor",
                       "tcga:tcga-lusc-tumor",
                       "tcga:tcga-thca-tumor",
                       "tcga:breast-tissue_female", 
                       "tcga:kidney-tissue",
                       "tcga:lung-tissue",
                       "tcga:brca.luma-subtype",
                       "tcga:brca.lumb-subtype")

# Prepare table for plots
sd_genes_vs_a_filtered = sd_genes_vs_a_df %>%
  filter(dataset_name %in% datasets_selected) %>%
  mutate(dataset_name = 
           replace(dataset_name, 
                   dataset_name == "gtex:breast.mammary.tissue_female", 
                   "GTEx: Breast")) %>%
  mutate(dataset_name = 
           replace(dataset_name, 
                   dataset_name == "tcga:tcga-brca-tumor_female", 
                   "TCGA: Breast cancer")) %>%
  left_join( (name2color %>% 
                as.data.frame() %>% 
                dplyr::rename("rgb_col"=".") %>% 
                tibble::rownames_to_column("dataset") #%>% 
                #filter(dataset_name %in% c("TCGA: Breast cancer", "GTEx: Breast"))
              ), 
             by=c("dataset") ) %>%
  mutate(rgb_col = replace(rgb_col, 
                           is.na(rgb_col), 
                           "#000000")) %>% # Non-selected datasets with color black
  mutate(geom_text_repel_label = 
           dplyr::if_else(dataset_name == "R. arthritis", 
                          "R. arthritis", 
                          dplyr::if_else(dataset_name == "GTEx: Whole blood", 
                                         "GTEx: Whole blood", 
                                         dplyr::if_else(dataset_name == "GTEx: Lung", 
                                                        "GTEx: Lung", 
                                                        dplyr::if_else(dataset_name == "GTEx: Breast", 
                                                                       "GTEx: Breast", dplyr::if_else(dataset_name == "TCGA: Lung cancer", 
                                                                                                      "TCGA: Lung cancer", 
                                                                                                      dplyr::if_else(dataset_name == "TCGA: Breast cancer", 
                                                                                                                     "TCGA: Breast cancer", 
                                                                                                                     ""))))))) %>% 
  as.data.frame()

scatter_plots_lessdatasets = list()
lm_summary_avspar = list()
lm_cor_result_avspar = list()

for (x in 1:length(parameters)){
  parameter = parameters[x]
  if (parameter == "cv"){
    cut <- cv_cut
    x_legend <- 2
    y_legend <- 0.38
  } else if (parameter == "sd"){
    cut <- sd_cut
    x_legend <- 1.585
    y_legend <- 0.938
  } else if (parameter == "mean"){
    cut <- mean_cut
    x_legend <- 1.585
    y_legend <- 0.968
  }

  # Calculate correlation
  a_vs_par_cor <- cor(sd_genes_vs_a_filtered$a, 
                     sd_genes_vs_a_filtered[[paste("fraction_genes_above_",
                                                   parameter,
                                                   "_cut",
                                                   sep = "")]])
  
  # Calculate regression
  lm_summary_avspar[[x]] <- summary(lm(sd_genes_vs_a_filtered[[paste("fraction_genes_above_",
                                                   parameter,
                                                   "_cut",
                                                   sep = "")]] ~ 
                             sd_genes_vs_a_filtered$a))
  lm_cor_result_avspar[[x]] <- paste("R² =", 
                         round(lm_summary_avspar[[x]]$adj.r.squared, 3), 
                         "\ncorr. =", 
                         round(a_vs_par_cor, 3))
  
  scatter_plots_lessdatasets[[x]] <- sd_genes_vs_a_filtered %>%
    ggplot(aes(x = a, 
               y = .data[[paste("fraction_genes_above_", 
                                parameter, 
                                "_cut", 
                                sep = "")]], 
               col = rgb_col)) + 
    geom_point(alpha = 0.6, size = 3) +
    scale_colour_identity() +
    geom_label_repel(aes(col=rgb_col, label=geom_text_repel_label),
                     fill="white",
                     max.iter = 1e6,
                     label.size = 0.75, 
                     size = 5,
                     alpha = 0.6, 
                     box.padding = 0.5,
                     label.padding = 0.25, 
                     fontface = "bold",
                     na.rm=TRUE,
                     seed = 1234) +
    geom_abline(aes(slope = coef(lm_summary_avspar[[x]])[2],
                  intercept = coef(lm_summary_avspar[[x]])[1]),
              col = "gray50",
              linetype = "dashed", 
              show.legend = FALSE) +
    annotate("text", x = x_legend, y = y_legend, label = lm_cor_result_avspar[[x]], size = 5) +
    xlab(expression(alpha)) +
    ylab(paste("Frac. genes with ", 
               parameter2label[[parameter]], 
               " above ", 
               cut, 
               sep = "")) +
    theme_linedraw() +
    theme(aspect.ratio = 1, 
          plot.title =  element_text(size = 20, face="bold"), 
          axis.title = element_text(size = 17, face="bold"), 
          axis.text = element_text(size = 16), 
          panel.grid.major=element_blank(), 
          panel.grid.minor = element_blank(),
          text = element_text(family = "Helvetica"),
          legend.position="none")

  plot_file = paste(plots_dir, 
                    "/a_vs_fraction_genes_above_", 
                    parameter, 
                    "_cut_lessdatasets.png", 
                    sep = "")
  ggsave(
    plot_file,
    plot = scatter_plots_lessdatasets[[x]],
    dpi = 1200,
    width = 6000,
    #height = 6000,
    units = c("px")
  )
  print(scatter_plots_lessdatasets[[x]])
}
## Saving 6000 x 6000 px image
## Saving 6000 x 6000 px image

## Saving 6000 x 6000 px image

Make figure for Supplementary Information:

layout <- "
ABC
DFG
"

((distribution_plots_lessdatasets[[2]] + labs(tag = 'A')) +
  (distribution_plots_lessdatasets[[1]] + labs(tag = 'B')) +
  (distribution_plots_lessdatasets[[3]] + labs(tag = 'C')) +
  ((scatter_plots_lessdatasets[[2]] + 
    geom_abline(aes(slope = coef(lm_summary_avspar[[2]])[2],
                  intercept = coef(lm_summary_avspar[[2]])[1]),
              col = "gray50",
              linetype = "dashed", 
              show.legend = FALSE)) + labs(tag = 'D')) +
  ((scatter_plots_lessdatasets[[1]]  + 
    geom_abline(aes(slope = coef(lm_summary_avspar[[1]])[2],
                  intercept = coef(lm_summary_avspar[[1]])[1]),
              col = "gray50",
              linetype = "dashed", 
              show.legend = FALSE)) + labs(tag = 'E')) +
  (scatter_plots_lessdatasets[[3]] + labs(tag = 'F'))) +
  plot_layout(design = layout) &
  theme(plot.tag = element_text(face = 'bold', size = 17),
        plot.tag.position  = c(.1, 1.02),
        plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider increasing max.overlaps
## ggrepel: 2 unlabeled data points (too many overlaps). Consider increasing max.overlaps

plot_file = paste(plots_dir, "/cv_results_SI.png", sep="")
ggsave(
  plot_file,
  dpi = 1200,
  width = 17500,
  #height = 12500,
  height = 12000,
  units = c("px")
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis

CV vs. sample size

We calculate the sample size necessary to reveal 50% of the significant correlations:

calculate_optimal_N <- function(L, a, b, s, N_guess=c(10000)) {
  optimize_N = function(par){
    N = par[1]
    f = s - L * exp((b * N ** (-a + 1)) / (-a + 1))
    return((abs(f)))
  }
  res = optim(par=N_guess, fn=optimize_N, method="Brent", lower=3, upper=50000)
  return(res$par)
}
model_selected = "Stretched exponential (by linear fit)"
cols = c("dataset_name", "s", "a", "N_opt")
fraction_links <- 0.5

N_opt_df = data.frame(matrix(ncol=length(cols),nrow=0, dimnames=list(NULL, cols)))
for (dataset_selected in unique(analytical_model_summary_df$dataset_name)){
  analytical_params_selected = analytical_model_summary_df %>%
    filter((dataset_name == dataset_selected) & (model == model_selected))
  N_opt <- calculate_optimal_N(L = analytical_params_selected$L, 
                               a = analytical_params_selected$a,
                               b = analytical_params_selected$b, 
                               s = fraction_links * analytical_params_selected$L)
                               # Why multiply 0.5 x L? Because the normalized result (e.g., 0.5) is the result of:
                               # 0.5 = (s * max_value_in_dataset / max_value_in_dataset) / L = s / L
                               # --> s = 0.5 * L
  N_opt_df = rbind(N_opt_df, 
                   data.frame(dataset_name = dataset_selected, 
                              s = fraction_links, 
                              a = analytical_params_selected$a, 
                              N_opt = N_opt))
}

N_opt_file = paste(tables_dir,
                   "/optimal_sample_size_",
                   fraction_links,
                   "links",
                   sep = "")

N_opt_df %>% 
  fwrite(N_opt_file)

N_opt_df
##                                            dataset_name   s        a      N_opt
## 1                      scipher:scipher.complete.dataset 0.5 1.790429  249.47471
## 2           scipher:scipher.sample.per.patient.baseline 0.5 1.758283  321.23471
## 3                            gtex:breast.mammary.tissue 0.5 1.720270  488.45289
## 4                     gtex:breast.mammary.tissue_female 0.5 1.811947  346.25815
## 5                                             gtex:lung 0.5 1.654318  547.29617
## 6                       gtex:skin.sun.exposed.lower.leg 0.5 1.836919  304.64639
## 7                                          gtex:thyroid 0.5 1.746114  409.71393
## 8                                      gtex:whole.blood 0.5 1.650687  375.10439
## 9                           tcga:tcga-brca-tumor_female 0.5 1.478753 3914.54702
## 10                                 tcga:tcga-kirc-tumor 0.5 1.565623 1373.65244
## 11                                 tcga:tcga-kirp-tumor 0.5 1.668704  741.92255
## 12                                 tcga:tcga-luad-tumor 0.5 1.649890  899.08309
## 13                                 tcga:tcga-lusc-tumor 0.5 1.549679 2160.52992
## 14                                 tcga:tcga-thca-tumor 0.5 1.610579  929.03972
## 15                            tcga:breast-tissue_female 0.5 1.836527  314.92674
## 16                                   tcga:kidney-tissue 0.5 1.771018  384.51866
## 17                                     tcga:lung-tissue 0.5 2.183346   94.04511
## 18                               tcga:brca.luma-subtype 0.5 1.528906 2190.78289
## 19                               tcga:brca.lumb-subtype 0.5 1.441041 5950.41238
## 20      gse193677:rectum-disease_inflamed_vs_control_cd 0.5 1.898637  221.26127
## 21 gse193677:rectum-disease_inflamed_vs_control_control 0.5 1.942484  200.27661
## 22      gse193677:rectum-disease_inflamed_vs_control_uc 0.5 2.198803  102.76362

Then, we match the optimal sample size to obtain 50% of significant correlations with the CV of the dataset.

sd_cut <- 10
mean_cut <- 10
cv_cut <- 50

datasets_selected <- c("scipher:scipher.sample.per.patient.baseline",
                       "gtex:breast.mammary.tissue_female",
                       "gtex:lung",
                       "gtex:skin.sun.exposed.lower.leg",
                       "gtex:thyroid",
                       "gtex:whole.blood",
                       "tcga:tcga-brca-tumor_female",
                       "tcga:tcga-kirc-tumor",
                       "tcga:tcga-kirp-tumor",
                       "tcga:tcga-luad-tumor",
                       "tcga:tcga-lusc-tumor",
                       "tcga:tcga-thca-tumor",
                       "tcga:breast-tissue_female", 
                       "tcga:kidney-tissue",
                       "tcga:lung-tissue",
                       "tcga:brca.luma-subtype",
                       "tcga:brca.lumb-subtype")

sd_genes_vs_ss_df <- sd_genes_df %>%
  filter(dataset_name %in% datasets_selected) %>%
  group_by(dataset, type_dataset, dataset_name) %>% 
  summarize(num_genes_above_sd_cut=length(sd[sd >= sd_cut]), 
            num_genes_above_mean_cut=length(mean[mean >= mean_cut]), 
            num_genes_above_cv_cut=length(cv[cv >= cv_cut]), 
            num_genes_total=n()
            ) %>% 
  ungroup() %>% 
  mutate(fraction_genes_above_sd_cut = num_genes_above_sd_cut / num_genes_total, 
         fraction_genes_above_mean_cut = num_genes_above_mean_cut / num_genes_total, 
         fraction_genes_above_cv_cut = num_genes_above_cv_cut / num_genes_total) %>% 
  inner_join(N_opt_df, by=c("dataset_name")) %>% 
  unique() %>%
  as.data.frame()
## `summarise()` has grouped output by 'dataset', 'type_dataset'. You can override
## using the `.groups` argument.
sd_genes_vs_ss_file = paste(tables_dir,
                            "/sd_vs_optimal_sample_size_",
                            fraction_links,
                            "links",
                            sep = "")

sd_genes_vs_ss_df %>% 
  fwrite(sd_genes_vs_ss_file)

# Modify dataset names for the plots
sd_genes_vs_ss_df = sd_genes_vs_ss_df %>%
  mutate(dataset_name = 
           replace(dataset_name, 
                   dataset_name == "tcga:tcga-brca-tumor_female", 
                   "TCGA: Breast cancer")) %>%
  mutate(dataset_name = 
           replace(dataset_name, 
                   dataset_name == "gtex:breast.mammary.tissue_female", 
                   "GTEx: Breast")) %>%
  # mutate(dataset_name = 
  #          replace(dataset_name, 
  #                  dataset_name == "scipher:scipher.sample.per.patient.baseline", 
  #                  "R. arthritis")) %>%
  # mutate(dataset_name = 
  #          replace(dataset_name, 
  #                  dataset_name == "gtex:whole.blood",
  #                  "GTEx: Whole blood")) %>%
  # #mutate(dataset_name = replace(dataset_name, dataset_name == "gtex:thyroid", "GTEx: Thyroid")) %>%
  # mutate(dataset_name = 
  #          replace(dataset_name, 
  #                  dataset_name == "gtex:lung", 
  #                  "GTEx: Lung")) %>%
  # #mutate(dataset_name = replace(dataset_name, dataset_name == "gtex:artery.tibial", "GTEx: Artery tibial")) %>%
  # #mutate(dataset_name = replace(dataset_name, dataset_name == "tcga:tcga-coad", "TCGA: Colon cancer")) %>%
  # #mutate(dataset_name = replace(dataset_name, dataset_name == "tcga:tcga-thca", "TCGA: Thyroid cancer")) %>%
  # mutate(dataset_name = 
  #          replace(dataset_name, 
  #                  dataset_name == "tcga:tcga-luad-tumor", 
  #                  "TCGA: Lung cancer")) %>%
  left_join( (name2color %>% 
                as.data.frame() %>% 
                dplyr::rename("rgb_col"=".") %>% 
                tibble::rownames_to_column("dataset") #%>% 
                #filter(dataset_name %in% c("TCGA: Breast cancer", "GTEx: Breast"))
              ), 
             by=c("dataset") ) %>%
  mutate(rgb_col = replace(rgb_col, is.na(rgb_col), "#000000")) %>% # Non-selected datasets with color black
  mutate(geom_text_repel_label=dplyr::if_else(dataset_name == "R. arthritis", "R. arthritis", 
                                              dplyr::if_else(dataset_name == "GTEx: Whole blood", "GTEx: Whole blood", 
                                                             dplyr::if_else(dataset_name == "GTEx: Lung", "GTEx: Lung", 
                                                                            dplyr::if_else(dataset_name == "GTEx: Breast", "GTEx: Breast", 
                                                                                           dplyr::if_else(dataset_name == "TCGA: Lung cancer", "TCGA: Lung cancer", 
                                                                                                          dplyr::if_else(dataset_name == "TCGA: Breast cancer", "TCGA: Breast cancer", "")))))))

sd_genes_vs_ss_df
##    dataset                        type_dataset
## 1     gtex               Breast.Mammary.Tissue
## 2     gtex                                Lung
## 3     gtex          Skin.Sun.Exposed.Lower.leg
## 4     gtex                             Thyroid
## 5     gtex                         Whole.Blood
## 6  scipher scipher.sample.per.patient.baseline
## 7     tcga                           BRCA.LumA
## 8     tcga                           BRCA.LumB
## 9     tcga                              Breast
## 10    tcga                              Kidney
## 11    tcga                                Lung
## 12    tcga                           TCGA-BRCA
## 13    tcga                           TCGA-KIRC
## 14    tcga                           TCGA-KIRP
## 15    tcga                           TCGA-LUAD
## 16    tcga                           TCGA-LUSC
## 17    tcga                           TCGA-THCA
##                                   dataset_name num_genes_above_sd_cut
## 1                                 GTEx: Breast                  17455
## 2                                    gtex:lung                  17469
## 3              gtex:skin.sun.exposed.lower.leg                  17083
## 4                                 gtex:thyroid                  17197
## 5                             gtex:whole.blood                  14927
## 6  scipher:scipher.sample.per.patient.baseline                  12480
## 7                       tcga:brca.luma-subtype                  18712
## 8                       tcga:brca.lumb-subtype                  18503
## 9                    tcga:breast-tissue_female                  18564
## 10                          tcga:kidney-tissue                  18302
## 11                            tcga:lung-tissue                  18160
## 12                         TCGA: Breast cancer                  18804
## 13                        tcga:tcga-kirc-tumor                  18776
## 14                        tcga:tcga-kirp-tumor                  18098
## 15                        tcga:tcga-luad-tumor                  18965
## 16                        tcga:tcga-lusc-tumor                  19175
## 17                        tcga:tcga-thca-tumor                  18081
##    num_genes_above_mean_cut num_genes_above_cv_cut num_genes_total
## 1                     17684                   8796           18132
## 2                     17623                  10459           18030
## 3                     17279                   8427           17804
## 4                     17411                   7840           17870
## 5                     14868                  14607           15148
## 6                     13189                   4163           13757
## 7                     18777                  13892           19181
## 8                     18629                  15160           19017
## 9                     19107                  10259           19826
## 10                    18811                   9001           19424
## 11                    18484                  13582           19141
## 12                    18804                  16231           19118
## 13                    18888                  13129           19310
## 14                    18214                  14830           18549
## 15                    18950                  18700           19200
## 16                    19239                  17100           19537
## 17                    18330                  10249           18827
##    fraction_genes_above_sd_cut fraction_genes_above_mean_cut
## 1                    0.9626627                     0.9752923
## 2                    0.9688852                     0.9774265
## 3                    0.9595035                     0.9705122
## 4                    0.9623391                     0.9743145
## 5                    0.9854106                     0.9815157
## 6                    0.9071745                     0.9587119
## 7                    0.9755487                     0.9789375
## 8                    0.9729716                     0.9795972
## 9                    0.9363462                     0.9637345
## 10                   0.9422364                     0.9684411
## 11                   0.9487488                     0.9656758
## 12                   0.9835757                     0.9835757
## 13                   0.9723459                     0.9781460
## 14                   0.9756860                     0.9819397
## 15                   0.9877604                     0.9869792
## 16                   0.9814711                     0.9847469
## 17                   0.9603761                     0.9736017
##    fraction_genes_above_cv_cut   s        a      N_opt rgb_col
## 1                    0.4851092 0.5 1.811947  346.25815 #00BA37
## 2                    0.5800887 0.5 1.654318  547.29617 #00BA37
## 3                    0.4733206 0.5 1.836919  304.64639 #00BA37
## 4                    0.4387241 0.5 1.746114  409.71393 #00BA37
## 5                    0.9642857 0.5 1.650687  375.10439 #00BA37
## 6                    0.3026096 0.5 1.758283  321.23471 #D55E00
## 7                    0.7242584 0.5 1.528906 2190.78289 #0072B2
## 8                    0.7971815 0.5 1.441041 5950.41238 #0072B2
## 9                    0.5174518 0.5 1.836527  314.92674 #0072B2
## 10                   0.4633958 0.5 1.771018  384.51866 #0072B2
## 11                   0.7095763 0.5 2.183346   94.04511 #0072B2
## 12                   0.8489905 0.5 1.478753 3914.54702 #0072B2
## 13                   0.6799068 0.5 1.565623 1373.65244 #0072B2
## 14                   0.7995040 0.5 1.668704  741.92255 #0072B2
## 15                   0.9739583 0.5 1.649890  899.08309 #0072B2
## 16                   0.8752623 0.5 1.549679 2160.52992 #0072B2
## 17                   0.5443778 0.5 1.610579  929.03972 #0072B2
##    geom_text_repel_label
## 1           GTEx: Breast
## 2                       
## 3                       
## 4                       
## 5                       
## 6                       
## 7                       
## 8                       
## 9                       
## 10                      
## 11                      
## 12   TCGA: Breast cancer
## 13                      
## 14                      
## 15                      
## 16                      
## 17

We plot the results of log10 alpha vs log10 sample size:

# Calculate correlation
ss_vs_a_cor <- cor(log10(sd_genes_vs_ss_df$N_opt),
                   log10(sd_genes_vs_ss_df$a))

# Calculate regression
lm_summary_ssvsa <- summary(lm(log10(sd_genes_vs_ss_df$a) ~ 
                           log10(sd_genes_vs_ss_df$N_opt)))
slope_ssvsa <- coef(lm_summary_ssvsa)[2]
intercept_ssvsa <- coef(lm_summary_ssvsa)[1]
adj.r.squared_ssvsa <- lm_summary_ssvsa$adj.r.squared
lm_cor_result_ssvsa <- paste("R² =", 
                       round(adj.r.squared_ssvsa, 3), 
                       "\ncorr. =", 
                       round(ss_vs_a_cor, 3))

a_vs_ss_plot <- sd_genes_vs_ss_df %>%
    ggplot(aes(x = log10(N_opt), 
               y = log10(a), 
               col = rgb_col)) + 
    geom_point(alpha = 0.6, size = 3) +
    scale_colour_identity() +
    geom_label_repel(aes(col = rgb_col, 
                         label = geom_text_repel_label),
                     fill = "white",
                     box.padding = 0.5,
                     max.iter = 1e5,
                     label.size = 0.75, 
                     size = 5,
                     alpha = 0.6, 
                     label.padding = 0.25, 
                     fontface = "bold",
                     na.rm = TRUE,
                     seed = 1234) +
    new_scale_color() +
    geom_abline(aes(slope = slope_ssvsa,
                    intercept = intercept_ssvsa,
                    col = lm_cor_result_ssvsa),
              linetype = "dashed", 
              show.legend = FALSE) +
    scale_color_manual(values = c("gray50")) +
    #annotate("text", x = 4500, y = 2.05, label = lm_cor_result_ssvsa, size = 5) +
    annotate("text", x = 3.3, y = 0.32, label = lm_cor_result_ssvsa, size = 5) +
    xlab("log10(n) to get 50% sig. links") +
    ylab(expression(bold(log10(alpha)))) +
    theme_linedraw() +
    theme(aspect.ratio = 1, 
          plot.title =  element_text(size = 20, face="bold"), 
          axis.title = element_text(size = 17, face="bold"), 
          axis.text = element_text(size = 16), 
          panel.grid.major=element_blank(), 
          panel.grid.minor = element_blank(),
          text = element_text(family = "Helvetica"),
          legend.position="none")

plot_file = paste(plots_dir, "/ss_vs_a.png", sep="")
ggsave(
  plot_file,
  plot = a_vs_ss_plot,
  dpi = 1200,
  width = 8000,
  #height = 6000,
  units = c("px")
)
## Saving 8000 x 6000 px image
print(a_vs_ss_plot)

We plot the results of alpha vs sample size (lin-lin):

# Calculate correlation
ss_vs_a_cor <- cor(sd_genes_vs_ss_df$N_opt,
                   sd_genes_vs_ss_df$a)

# Calculate regression
lm_summary_ssvsa_linlin <- summary(lm(sd_genes_vs_ss_df$a ~ 
                           sd_genes_vs_ss_df$N_opt))
slope_ssvsa_linlin <- coef(lm_summary_ssvsa_linlin)[2]
intercept_ssvsa_linlin <- coef(lm_summary_ssvsa_linlin)[1]
adj.r.squared_ssvsa <- lm_summary_ssvsa_linlin$adj.r.squared
lm_cor_result_ssvsa <- paste("R² =", 
                       round(adj.r.squared_ssvsa, 3), 
                       "\ncorr. =", 
                       round(ss_vs_a_cor, 3))

a_vs_ss_lin_lin_plot <- sd_genes_vs_ss_df %>%
    ggplot(aes(x = N_opt, 
               y = a, 
               col = rgb_col)) + 
    geom_point(alpha = 0.6, size = 3) +
    scale_colour_identity() +
    geom_label_repel(aes(col = rgb_col, 
                         label = geom_text_repel_label),
                     fill = "white",
                     box.padding = 0.5, 
                     max.iter = 1e5,
                     label.size = 0.75, 
                     size = 5,
                     alpha = 0.6, 
                     label.padding=0.25, 
                     fontface = "bold",
                     na.rm = TRUE,
                     seed = 1234) +
    new_scale_color() +
    geom_abline(aes(slope = slope_ssvsa_linlin,
                    intercept = intercept_ssvsa_linlin,
                    col = lm_cor_result_ssvsa),
              linetype = "dashed", 
              show.legend = FALSE) +
    scale_color_manual(values = c("gray50")) +
    #annotate("text", x = 4500, y = 2.05, label = lm_cor_result_ssvsa, size = 5) +
    annotate("text", x = 4000, y = 2, label = lm_cor_result_ssvsa, size = 5) +
    xlab("n to get 50% sig. links") +
    ylab(expression(bold(alpha))) +
    theme_linedraw() +
    theme(aspect.ratio = 1, 
          plot.title =  element_text(size = 20, face="bold"), 
          axis.title = element_text(size = 17, face="bold"), 
          axis.text = element_text(size = 16), 
          panel.grid.major=element_blank(), 
          panel.grid.minor = element_blank(),
          text = element_text(family = "Helvetica"),
          legend.position="none")
print(a_vs_ss_lin_lin_plot)

We plot the results of log10 alpha vs sample size:

# Calculate correlation
ss_vs_a_cor <- cor(sd_genes_vs_ss_df$N_opt,
                   log10(sd_genes_vs_ss_df$a))

# Calculate regression
lm_summary_ssvsa_loglin <- summary(lm(log10(sd_genes_vs_ss_df$a) ~ 
                           sd_genes_vs_ss_df$N_opt))
slope_ssvsa_loglin <- coef(lm_summary_ssvsa_loglin)[2]
intercept_ssvsa_loglin <- coef(lm_summary_ssvsa_loglin)[1]
adj.r.squared_ssvsa <- lm_summary_ssvsa_loglin$adj.r.squared
lm_cor_result_ssvsa <- paste("R² =", 
                       round(adj.r.squared_ssvsa, 3), 
                       "\ncorr. =", 
                       round(ss_vs_a_cor, 3))

a_vs_ss_log_lin_plot <- sd_genes_vs_ss_df %>%
    ggplot(aes(x = N_opt, 
               y = log10(a), 
               col = rgb_col)) + 
    geom_point(alpha = 0.6, size = 3) +
    scale_colour_identity() +
    geom_label_repel(aes(col = rgb_col, 
                         label = geom_text_repel_label),
                     fill = "white",
                     box.padding = 0.5, 
                     max.overlaps = Inf,
                     label.size = 0.75, 
                     size = 5,
                     alpha = 0.6, 
                     label.padding=0.25, 
                     fontface = "bold",
                     na.rm = TRUE,
                     seed = 1234) +
    new_scale_color() +
    geom_abline(aes(slope = slope_ssvsa_loglin,
                    intercept = intercept_ssvsa_loglin,
                    col = lm_cor_result_ssvsa),
              linetype = "dashed", 
              show.legend = FALSE) +
    scale_color_manual(values = c("gray50")) +
    #annotate("text", x = 4500, y = 2.05, label = lm_cor_result_ssvsa, size = 5) +
    annotate("text", x = 4000, y = 0.3, label = lm_cor_result_ssvsa, size = 5) +
    xlab("n to get 50% sig. links") +
    ylab(expression(bold(log10(alpha)))) +
    theme_linedraw() +
    theme(aspect.ratio = 1, 
          plot.title =  element_text(size = 20, face="bold"), 
          axis.title = element_text(size = 17, face="bold"), 
          axis.text = element_text(size = 16), 
          panel.grid.major=element_blank(), 
          panel.grid.minor = element_blank(),
          text = element_text(family = "Helvetica"),
          legend.position="none")
print(a_vs_ss_log_lin_plot)

We plot the results of alpha vs log10 sample size:

# Calculate correlation
ss_vs_a_cor <- cor(log10(sd_genes_vs_ss_df$N_opt),
                   sd_genes_vs_ss_df$a)

# Calculate regression
lm_summary_ssvsa_linlog <- summary(lm(sd_genes_vs_ss_df$a ~ 
                           log10(sd_genes_vs_ss_df$N_opt)))
slope_ssvsa_linlog <- coef(lm_summary_ssvsa_linlog)[2]
intercept_ssvsa_linlog <- coef(lm_summary_ssvsa_linlog)[1]
adj.r.squared_ssvsa <- lm_summary_ssvsa_linlog$adj.r.squared
lm_cor_result_ssvsa <- paste("R² =", 
                       round(adj.r.squared_ssvsa, 3), 
                       "\ncorr. =", 
                       round(ss_vs_a_cor, 3))

a_vs_ss_lin_log_plot <- sd_genes_vs_ss_df %>%
    ggplot(aes(x = log10(N_opt), 
               y = a, 
               col = rgb_col)) + 
    geom_point(alpha = 0.6, size = 3) +
    scale_colour_identity() +
    geom_label_repel(aes(col = rgb_col, 
                         label = geom_text_repel_label),
                     fill = "white",
                     box.padding = 0.5, 
                     max.iter = 1e5,
                     label.size = 0.75, 
                     size = 5,
                     alpha = 0.6, 
                     label.padding=0.25, 
                     fontface = "bold",
                     na.rm = TRUE,
                     seed = 1234) +
    new_scale_color() +
    geom_abline(aes(slope = slope_ssvsa_linlog,
                    intercept = intercept_ssvsa_linlog,
                    col = lm_cor_result_ssvsa),
              linetype = "dashed", 
              show.legend = FALSE) +
    scale_color_manual(values = c("gray50")) +
    #annotate("text", x = 4500, y = 2.05, label = lm_cor_result_ssvsa, size = 5) +
    annotate("text", x = 3.3, y = 2, label = lm_cor_result_ssvsa, size = 5) +
    xlab("log10(n) to get 50% sig. links") +
    ylab(expression(bold(alpha))) +
    theme_linedraw() +
    theme(aspect.ratio = 1, 
          plot.title =  element_text(size = 20, face="bold"), 
          axis.title = element_text(size = 17, face="bold"), 
          axis.text = element_text(size = 16), 
          panel.grid.major=element_blank(), 
          panel.grid.minor = element_blank(),
          text = element_text(family = "Helvetica"),
          legend.position="none")
print(a_vs_ss_lin_log_plot)

We plot all results of alpha vs. sample size together:

layout <- "
AB
CD
"

((a_vs_ss_lin_lin_plot +
    labs(tag = 'A') +
    theme(plot.tag.position = c(.07, 1.04),
          plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
  ) +
    (a_vs_ss_lin_log_plot +
       labs(tag = 'B') +
       theme(plot.tag.position = c(.07, 1.04),
             plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
     ) +
    (a_vs_ss_log_lin_plot +
       labs(tag = 'C') +
       theme(plot.tag.position = c(.07, 1.04),
             plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
     ) +
    (a_vs_ss_plot +
       labs(tag = 'D') +
       theme(plot.tag.position = c(.07, 1.04),
             plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
     )) +
  plot_layout(design = layout) &
  theme(plot.tag = element_text(face = 'bold', size = 17))

plot_file = paste(plots_dir, "/ss_vs_a_all_combinations.png", sep="")
ggsave(
  plot_file,
  dpi = 1200,
  width = 12000,
  height = 11000,
  units = c("px")
)

We plot the results of CV vs sample size:

# Calculate correlation
cv_vs_ss_cor <- cor(sd_genes_vs_ss_df$N_opt, 
                   sd_genes_vs_ss_df$fraction_genes_above_cv_cut)

# Calculate regression
lm_summary_cvvsss <- summary(lm(sd_genes_vs_ss_df$fraction_genes_above_cv_cut ~ 
                           sd_genes_vs_ss_df$N_opt))
slope_cvvsss <- coef(lm_summary_cvvsss)[2]
intercept_cvvsss <- coef(lm_summary_cvvsss)[1]
adj.r.squared_cvvsss <- lm_summary_cvvsss$adj.r.squared
lm_cor_result_cvvsss <- paste("R² =", 
                       round(adj.r.squared_cvvsss, 3), 
                       "\ncorr. =", 
                       round(cv_vs_ss_cor, 3))


cv_vs_ss_plot <- sd_genes_vs_ss_df %>%
    ggplot(aes(x = N_opt, 
               y = fraction_genes_above_cv_cut, 
               col = rgb_col)) + 
    geom_point(alpha = 0.6, size = 3) +
    scale_colour_identity() +
    geom_label_repel(aes(col = rgb_col, 
                         label = geom_text_repel_label),
                     fill = "white",
                     box.padding = 0.5, 
                     max.overlaps = Inf,
                     label.size = 0.75, 
                     size = 5,
                     alpha = 0.6, 
                     label.padding=0.25, 
                     fontface = "bold",
                     na.rm = TRUE,
                     seed = 1234) +
    new_scale_color() +
    geom_abline(aes(slope = slope_cvvsss,
                    intercept = intercept_cvvsss,
                    col = lm_cor_result_cvvsss),
              linetype = "dashed", 
              show.legend = FALSE) +
    scale_color_manual(values = c("gray50")) +
    annotate("text", x = 4500, y = 0.41, label = lm_cor_result_cvvsss, size = 5) +
    xlab("Num. samples for 50% sig. links") +
    ylab(paste("Frac. genes with CV above ", 
               cv_cut, 
               sep = "")) +
    theme_linedraw() +
    theme(aspect.ratio = 1, 
          plot.title =  element_text(size = 20, face="bold"), 
          axis.title = element_text(size = 17, face="bold"), 
          axis.text = element_text(size = 16), 
          panel.grid.major=element_blank(), 
          panel.grid.minor = element_blank(),
          text = element_text(family = "Helvetica"),
          legend.position="none")

plot_file = paste(plots_dir, "/ss_vs_fraction_genes_above_cv_cut.png", sep="")
ggsave(
  plot_file,
  plot = cv_vs_ss_plot,
  dpi = 1200,
  width = 8000,
  #height = 6000,
  units = c("px")
)
## Saving 8000 x 6000 px image
print(cv_vs_ss_plot)

We plot the results of SD vs sample size:

# Calculate correlation
sd_vs_ss_cor <- cor(sd_genes_vs_ss_df$N_opt, 
                   sd_genes_vs_ss_df$fraction_genes_above_sd_cut)

# Calculate regression
lm_summary_sdvsss <- summary(lm(sd_genes_vs_ss_df$fraction_genes_above_sd_cut ~ 
                           sd_genes_vs_ss_df$N_opt))
slope_sdvsss <- coef(lm_summary_sdvsss)[2]
intercept_sdvsss <- coef(lm_summary_sdvsss)[1]
adj.r.squared_sdvsss <- lm_summary_sdvsss$adj.r.squared
lm_cor_result_sdvsss <- paste("R² =", 
                       round(adj.r.squared_sdvsss, 3), 
                       "\ncorr. =", 
                       round(sd_vs_ss_cor, 3))


sd_vs_ss_plot <- sd_genes_vs_ss_df %>%
    ggplot(aes(x = N_opt, 
               y = fraction_genes_above_sd_cut, 
               col = rgb_col)) + 
    geom_point(alpha = 0.6, size = 3) +
    scale_colour_identity() +
    geom_label_repel(aes(col = rgb_col, 
                         label = geom_text_repel_label),
                     fill = "white",
                     box.padding = 0.5, 
                     #max.overlaps = Inf,
                     max.iter = 1e5,
                     label.size = 0.75, 
                     size = 5,
                     alpha = 0.6, 
                     label.padding=0.25, 
                     fontface = "bold",
                     na.rm = TRUE,
                     seed = 1234) +
    new_scale_color() +
    geom_abline(aes(slope = slope_sdvsss,
                    intercept = intercept_sdvsss,
                    col = lm_cor_result_sdvsss),
              linetype = "dashed", 
              show.legend = FALSE) +
    scale_color_manual(values = c("gray50")) +
    annotate("text", x = 3500, y = 0.93, label = lm_cor_result_sdvsss, size = 5) +
    xlab("Num. samples for 50% sig. links") +
    ylab(paste("Frac. genes with SD above ", 
               sd_cut, 
               sep = "")) +
    theme_linedraw() +
    theme(aspect.ratio = 1, 
          plot.title =  element_text(size = 20, face="bold"), 
          axis.title = element_text(size = 17, face="bold"), 
          axis.text = element_text(size = 16), 
          panel.grid.major=element_blank(), 
          panel.grid.minor = element_blank(),
          text = element_text(family = "Helvetica"),
          legend.position="none")

plot_file = paste(plots_dir, "/ss_vs_fraction_genes_above_sd_cut.png", sep="")
ggsave(
  plot_file,
  plot = sd_vs_ss_plot,
  dpi = 1200,
  width = 8000,
  #height = 6000,
  units = c("px")
)
## Saving 8000 x 6000 px image
print(sd_vs_ss_plot)

Figure 2: Panel containing power law demonstration plots

Create a panel with all the results:

# # Use patchwork to concatenate plots
# (((curve_plot + labs(tag = 'A')) |
#   (scaling_relation_plot + labs(tag = 'B')) |
#   (comparison_plot + labs(tag = 'C'))) /
#   (wrap_elements(figure_summary_table_plot) + labs(tag='D')) / 
#   ((prediction_plot + labs(tag = 'E')) |
#   (a_vs_fraction_sig_correlations_plot + labs(tag = 'F')) |
#   (scatter_plots[[3]] + labs(tag='G')))) & 
#   theme(element_text(face = 'bold', size = 17)) # bold tags

layout <- "
ABC
DDD
EFG
"

((goodnessfit_plot_lessdatasets +
    labs(tag = 'A') +
    theme(plot.tag.position = c(.09, 1.04),
          plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
  ) +
    (scaling_relation_plot_lessdatasets +
       labs(tag = 'B') +
       theme(plot.tag.position = c(.07, 1.04),
             plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
     ) +
    (prediction_plot_lessdatasets +
       labs(tag = 'C') +
       theme(plot.tag.position = c(.07, 1.04),
             plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
     ) +
    (wrap_elements(figure_summary_table_plot) +
       labs(tag='D') +
       theme(plot.tag.position = c(.20, .87),
             plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
     ) +
    (a_vs_fraction_sig_correlations_lessdatasets_plot +
       labs(tag = 'E') +
       theme(plot.tag.position = c(.09, 1.04),
             plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
     ) +
    (scatter_plots_lessdatasets[[3]] +
       labs(tag = 'F') +
       theme(plot.tag.position = c(.04, 1.04),
             plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
     ) +
    (a_vs_ss_lin_log_plot +
       labs(tag='G')
     ) +
    theme(plot.tag.position  = c(.04, 1.04),
          plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
  ) +
  plot_layout(design = layout) &
  theme(plot.tag = element_text(face = 'bold', size = 17))
## Warning: Removed 9938 rows containing missing values (`geom_point()`).
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

plot_file = paste(plots_dir, "/modelling_and_cv_results.png", sep="")
ggsave(
  plot_file,
  dpi = 1200,
  width = 20500,
  #height = 12500,
  height = 18000,
  units = c("px")
)
## Warning: Removed 9938 rows containing missing values (`geom_point()`).
# (((curve_plot +
#       labs(tag = 'A') +
#       theme(legend.position="none")) | 
#     (comparison_plot +
#       labs(tag = 'D') +
#       guides(col=guide_legend(title="Model", title.position="top")) +
#       theme(legend.position = c(0.7, 0.24), legend.text = element_text(size = 11), legend.title=element_text(size=13, face="bold"), legend.background = element_rect(size=0.2, linetype="solid", colour ="black"))) |
#     (a_vs_fraction_sig_correlations_plot +
#      labs(tag = 'F') +
#      theme(legend.position="none"))) /
#   ((scaling_relation_plot +
#       labs(tag = 'B') +
#       theme(legend.position="none")) | 
#     (prediction_plot +
#       labs(tag = 'E') +
#       #guides(col=guide_legend(title="Dataset", title.position="top", nrow=3,byrow=TRUE)) +
#       #theme(legend.position = 'bottom', legend.text = element_text(size = 11), legend.title=element_text(size=13, face="bold"))) |
#       theme(legend.position = 'none')) |
#      (distribution_plots[[3]] +
#      labs(tag = 'G') +
#      theme(legend.position="none"))) /
#    ((wrap_elements(figure_summary_table_plot) + labs(tag='C')) | 
#       (scatter_plots[[3]] + labs(tag='H')))) & 
#   #plot_annotation(tag_levels = 'A') & # Include letters 
#   theme(element_text(face = 'bold', size = 17)) # that are bold

Levels of correlations plot

Define function to calculate critical sample size for correlation:

calculate_sample_size_for_pearson_correlation_using_t_distribution_with_optimization = function(r, total_num_edges, alpha=0.05, N_guess=c(10000)){
  optimize_N = function(par){
    N = par[1]
    t_guess = qt(alpha, df=N-2, lower.tail = F)
    t = r * sqrt((N-2)) / sqrt((1-r**2))
    return((abs(t-t_guess)))
  }
  res = optim(par=N_guess, fn=optimize_N, method="Brent", lower=3, upper=50000)
  return(res$par)
}

calculate_sample_size_for_pearson_correlation_using_z_distribution_with_power = function(r, total_num_edges, alpha=0.05, power=0.8){
  alpha_corrected = alpha / total_num_edges # Correct alpha by multiple error
  Za = qnorm(alpha, lower.tail=F)
  Za_corrected = qnorm(alpha_corrected, lower.tail=F)
  Zb = qnorm(power, lower.tail=F)
  N = ((Za + Zb) / (0.5 * log((1+r)/(1-r))))**2 + 3
  N_corrected = ((Za_corrected + Zb) / (0.5 * log((1+r)/(1-r))))**2 + 3
  return(list(N=N, N_corrected=N_corrected))
}

calculate_sample_size_for_pearson_correlation_using_z_distribution = function(r, alpha=0.05){
  Za = qnorm(alpha, lower.tail=F)
  N = r / (2*Za - log((1+r)/(1-r))) + 1
  return(N)
}

calculate_sample_size_for_pearson_correlation_using_bonnet_and_wright = function(r, w, alpha=0.05){
  Za = qnorm(alpha, lower.tail=F)
  N0 = round(4 * (1-r**2)**2 * (Za/w)**2)
  L1 = 0.5 * (log(1+r) - log(1-r)) - Za/sqrt(N0-3)
  L2 = 0.5 * (log(1+r) - log(1-r)) + Za/sqrt(N0-3)
  w0 = ((exp(2*L2) - 1) / (exp(2*L2) + 1)) - ((exp(2*L1) - 1) / (exp(2*L1) + 1))
  N = ((N0 - 3) * (w0 / w)**2) + 3
  return(N)
}

calculate_predictions_using_stretched_exponential_model_optimized = function(x, L, a, b){
  y = L * exp((b*x**(-a+1))/(-a+1))
  #y = exp(log(L) - exp(b+a*log(x)))
  return(y)
}

Calculate sample size for different levels of correlation and different datasets:

type_correlation_df = data.frame(type_correlation=c("weak", "moderate", "strong", "very strong"), lower_val=c(0.2,0.4,0.6,0.8), upper_val=c(0.4,0.6,0.8,NA))
types_correlation = c("weak", "moderate", "strong", "very strong")

cols = c("dataset", "type_correlation", "correlation", "sample_size_statistical", "sample_size_statistical_corrected", "sample_size_z")
sample_size_correlation_df = data.frame(matrix(ncol=length(cols),nrow=0, dimnames=list(NULL, cols)))
for (dataset in numbers_df$dataset){
  total_num_edges = (numbers_df %>% filter(dataset == !!dataset))$total_num_edges
  for (type_correlation in types_correlation){
    r = (type_correlation_df %>% filter(type_correlation == !!type_correlation))$lower_val
    N = calculate_sample_size_for_pearson_correlation_using_t_distribution_with_optimization(r=r, total_num_edges=total_num_edges, alpha=0.05, N_guess=c(10000))
    N_corrected = calculate_sample_size_for_pearson_correlation_using_t_distribution_with_optimization(r=r, total_num_edges=total_num_edges, alpha=0.05/total_num_edges, N_guess=c(10000))
    N_z = calculate_sample_size_for_pearson_correlation_using_bonnet_and_wright(r=r, w=0.8, alpha=0.05/total_num_edges)
    sample_size_correlation_df <- rbind(sample_size_correlation_df, data.frame(dataset=dataset, type_correlation=type_correlation, correlation=r, sample_size_statistical=N, sample_size_statistical_corrected=N_corrected, sample_size_z=N_z))
  }
}

print(sample_size_correlation_df)
##                                        dataset type_correlation correlation
## 1             scipher:scipher.complete.dataset             weak         0.2
## 2             scipher:scipher.complete.dataset         moderate         0.4
## 3             scipher:scipher.complete.dataset           strong         0.6
## 4             scipher:scipher.complete.dataset      very strong         0.8
## 5  scipher:scipher.sample.per.patient.baseline             weak         0.2
## 6  scipher:scipher.sample.per.patient.baseline         moderate         0.4
## 7  scipher:scipher.sample.per.patient.baseline           strong         0.6
## 8  scipher:scipher.sample.per.patient.baseline      very strong         0.8
## 9                        tcga:tcga-brca_female             weak         0.2
## 10                       tcga:tcga-brca_female         moderate         0.4
## 11                       tcga:tcga-brca_female           strong         0.6
## 12                       tcga:tcga-brca_female      very strong         0.8
## 13                              tcga:tcga-kirc             weak         0.2
## 14                              tcga:tcga-kirc         moderate         0.4
## 15                              tcga:tcga-kirc           strong         0.6
## 16                              tcga:tcga-kirc      very strong         0.8
## 17                              tcga:tcga-kirp             weak         0.2
## 18                              tcga:tcga-kirp         moderate         0.4
## 19                              tcga:tcga-kirp           strong         0.6
## 20                              tcga:tcga-kirp      very strong         0.8
## 21                              tcga:tcga-luad             weak         0.2
## 22                              tcga:tcga-luad         moderate         0.4
## 23                              tcga:tcga-luad           strong         0.6
## 24                              tcga:tcga-luad      very strong         0.8
## 25                              tcga:tcga-lusc             weak         0.2
## 26                              tcga:tcga-lusc         moderate         0.4
## 27                              tcga:tcga-lusc           strong         0.6
## 28                              tcga:tcga-lusc      very strong         0.8
## 29                              tcga:tcga-thca             weak         0.2
## 30                              tcga:tcga-thca         moderate         0.4
## 31                              tcga:tcga-thca           strong         0.6
## 32                              tcga:tcga-thca      very strong         0.8
## 33                          tcga:breast_female             weak         0.2
## 34                          tcga:breast_female         moderate         0.4
## 35                          tcga:breast_female           strong         0.6
## 36                          tcga:breast_female      very strong         0.8
## 37                                 tcga:kidney             weak         0.2
## 38                                 tcga:kidney         moderate         0.4
## 39                                 tcga:kidney           strong         0.6
## 40                                 tcga:kidney      very strong         0.8
## 41                                   tcga:lung             weak         0.2
## 42                                   tcga:lung         moderate         0.4
## 43                                   tcga:lung           strong         0.6
## 44                                   tcga:lung      very strong         0.8
## 45                              tcga:brca.luma             weak         0.2
## 46                              tcga:brca.luma         moderate         0.4
## 47                              tcga:brca.luma           strong         0.6
## 48                              tcga:brca.luma      very strong         0.8
## 49                              tcga:brca.lumb             weak         0.2
## 50                              tcga:brca.lumb         moderate         0.4
## 51                              tcga:brca.lumb           strong         0.6
## 52                              tcga:brca.lumb      very strong         0.8
## 53                  gtex:breast.mammary.tissue             weak         0.2
## 54                  gtex:breast.mammary.tissue         moderate         0.4
## 55                  gtex:breast.mammary.tissue           strong         0.6
## 56                  gtex:breast.mammary.tissue      very strong         0.8
## 57           gtex:breast.mammary.tissue_female             weak         0.2
## 58           gtex:breast.mammary.tissue_female         moderate         0.4
## 59           gtex:breast.mammary.tissue_female           strong         0.6
## 60           gtex:breast.mammary.tissue_female      very strong         0.8
## 61                                   gtex:lung             weak         0.2
## 62                                   gtex:lung         moderate         0.4
## 63                                   gtex:lung           strong         0.6
## 64                                   gtex:lung      very strong         0.8
## 65             gtex:skin.sun.exposed.lower.leg             weak         0.2
## 66             gtex:skin.sun.exposed.lower.leg         moderate         0.4
## 67             gtex:skin.sun.exposed.lower.leg           strong         0.6
## 68             gtex:skin.sun.exposed.lower.leg      very strong         0.8
## 69                                gtex:thyroid             weak         0.2
## 70                                gtex:thyroid         moderate         0.4
## 71                                gtex:thyroid           strong         0.6
## 72                                gtex:thyroid      very strong         0.8
## 73                            gtex:whole.blood             weak         0.2
## 74                            gtex:whole.blood         moderate         0.4
## 75                            gtex:whole.blood           strong         0.6
## 76                            gtex:whole.blood      very strong         0.8
## 77                gse193677:rectum_cd_inflamed             weak         0.2
## 78                gse193677:rectum_cd_inflamed         moderate         0.4
## 79                gse193677:rectum_cd_inflamed           strong         0.6
## 80                gse193677:rectum_cd_inflamed      very strong         0.8
## 81        gse193677:rectum_control_noninflamed             weak         0.2
## 82        gse193677:rectum_control_noninflamed         moderate         0.4
## 83        gse193677:rectum_control_noninflamed           strong         0.6
## 84        gse193677:rectum_control_noninflamed      very strong         0.8
## 85                gse193677:rectum_uc_inflamed             weak         0.2
## 86                gse193677:rectum_uc_inflamed         moderate         0.4
## 87                gse193677:rectum_uc_inflamed           strong         0.6
## 88                gse193677:rectum_uc_inflamed      very strong         0.8
##    sample_size_statistical sample_size_statistical_corrected sample_size_z
## 1                68.773025                         914.63989     196.99184
## 2                18.002295                         216.05522     154.37229
## 3                 8.523611                          85.91378      97.82438
## 4                 5.063346                          38.90078      49.91944
## 5                68.773025                         914.18952     196.90537
## 6                18.002295                         215.94977     154.30369
## 7                 8.523611                          85.87258      97.77885
## 8                 5.063346                          38.88278      49.89271
## 9                68.773025                         945.64424     203.65958
## 10               18.002295                         223.31440     159.55817
## 11                8.523611                          88.74980     101.07195
## 12                5.063346                          40.13973      51.49231
## 13               68.773025                         946.59975     203.84314
## 14               18.002295                         223.53812     159.70378
## 15                8.523611                          88.83721     101.16863
## 16                5.063346                          40.17791      51.54901
## 17               68.773025                         942.75523     203.01564
## 18               18.002295                         222.63799     159.04099
## 19                8.523611                          88.48554     100.75184
## 20                5.063346                          40.02428      51.32086
## 21               68.773025                         946.05349     203.73821
## 22               18.002295                         223.41022     159.62054
## 23                8.523611                          88.78724     101.11336
## 24                5.063346                          40.15608      51.51660
## 25               68.773025                         947.71732     204.05778
## 26               18.002295                         223.79978     159.87405
## 27                8.523611                          88.93943     101.28168
## 28                5.063346                          40.22257      51.61533
## 29               68.773025                         944.17760     203.28879
## 30               18.002295                         222.97101     159.33462
## 31                8.523611                          88.61565     100.89565
## 32                5.063346                          40.08112      51.40528
## 33               68.773025                         949.12151     204.41649
## 34               18.002295                         224.12855     160.08794
## 35                8.523611                          89.06788     101.42372
## 36                5.063346                          40.27868      51.69864
## 37               68.773025                         947.16262     203.95126
## 38               18.002295                         223.66991     159.78954
## 39                8.523611                          88.88869     101.22557
## 40                5.063346                          40.20040      51.58241
## 41               68.773025                         945.75920     203.68167
## 42               18.002295                         223.34132     159.57569
## 43                8.523611                          88.76032     101.08359
## 44                5.063346                          40.14432      51.49913
## 45               68.773025                         945.95881     203.72002
## 46               18.002295                         223.38805     159.60611
## 47                8.523611                          88.77858     101.10378
## 48                5.063346                          40.15230      51.51098
## 49               68.773025                         945.13773     203.56227
## 50               18.002295                         223.19581     159.48097
## 51                8.523611                          88.70347     101.02071
## 52                5.063346                          40.11949      51.46225
## 53               68.773025                         938.69085     202.14591
## 54               18.002295                         221.68638     158.42167
## 55                8.523611                          88.11376     100.34082
## 56                5.063346                          39.86187      51.07961
## 57               68.773025                         940.58126     202.59800
## 58               18.002295                         222.12898     158.70978
## 59                8.523611                          88.28668     100.53201
## 60                5.063346                          39.93741      51.19182
## 61               68.773025                         940.04191     202.40535
## 62               18.002295                         222.00271     158.62759
## 63                8.523611                          88.23735     100.47747
## 64                5.063346                          39.91586      51.15981
## 65               68.773025                         938.83594     202.17378
## 66               18.002295                         221.72035     158.44378
## 67                8.523611                          88.12704     100.35549
## 68                5.063346                          39.86767      51.08822
## 69               68.773025                         939.18970     202.24171
## 70               18.002295                         221.80317     158.49771
## 71                8.523611                          88.15940     100.39127
## 72                5.063346                          39.88180      51.10922
## 73               68.773025                         923.39357     198.94043
## 74               18.002295                         218.10476     155.86004
## 75                8.523611                          86.71449      98.73771
## 76                5.063346                          39.25058      50.43868
## 77               68.773025                         940.92339     202.66374
## 78               18.002295                         222.20909     158.76192
## 79                8.523611                          88.31798     100.56661
## 80                5.063346                          39.95108      51.21213
## 81               68.773025                         940.92339     202.66374
## 82               18.002295                         222.20909     158.76192
## 83                8.523611                          88.31798     100.56661
## 84                5.063346                          39.95108      51.21213
## 85               68.773025                         940.92339     202.66374
## 86               18.002295                         222.20909     158.76192
## 87                8.523611                          88.31798     100.56661
## 88                5.063346                          39.95108      51.21213
dataset_selected = "gtex:whole.blood"
model_selected = "Stretched exponential (by linear fit)"
results_analytical_gtex_wb = topology_results_selected_analytical_norm_df %>% 
  filter((model == model_selected) & (type_dataset == dataset_selected)) %>%
  select(model, type_dataset, a, b, L, max_value_in_dataset) %>% unique()
sample_size_correlation_gtex_wb = sample_size_correlation_df %>%
  filter((dataset == dataset_selected) & (!(type_correlation == "weak")))

predictions_convergence_gtex_wb_norm = calculate_predictions_using_stretched_exponential_model_optimized(x=sample_size_correlation_gtex_wb$sample_size_statistical_corrected, L=results_analytical_gtex_wb$L, a=results_analytical_gtex_wb$a, b=results_analytical_gtex_wb$b)
predictions_convergence_gtex_wb_df = data.frame(size=round(sample_size_correlation_gtex_wb$sample_size_statistical_corrected), num_edges=predictions_convergence_gtex_wb_norm * results_analytical_gtex_wb$max_value_in_dataset, num_edges_norm=predictions_convergence_gtex_wb_norm)
predictions_convergence_gtex_wb_df$num_edges_norm = (predictions_convergence_gtex_wb_df$num_edges/results_analytical_gtex_wb$max_value_in_dataset)/results_analytical_gtex_wb$L
predictions_convergence_gtex_wb_df
##   size num_edges num_edges_norm
## 1  218  42782199     0.37291614
## 2   87  19008527     0.16569009
## 3   39   5649757     0.04924678
predicted_results_wb_df = predicted_results_df %>% filter((model == model_selected) & (type_dataset == dataset_selected)) %>% mutate_at(c('model_result'), as.numeric)
predicted_results_wb_df$model_result_norm = (predicted_results_wb_df$model_result/results_analytical_gtex_wb$max_value_in_dataset)/results_analytical_gtex_wb$L
#mtcars_mod = mtcars %>% mutate_at(c("carb"), as.character)
#mtcars_mod %>% ggplot(aes(mpg, wt)) + geom_point(aes(col=carb, size=gear))

dataset_selected = "Whole.Blood"
all_topology_results_file = '/home/j.aguirreplans/Projects/Scipher/SampleSize/scripts/SampleSizeShiny/data/analysis_topology.csv'
threshold_selected = 0.05

all_results_df = fread(all_topology_results_file)
results_gtex_wb = all_results_df %>% 
  filter((type_dataset == dataset_selected) & (threshold == threshold_selected) & (type_correlation %in% c("weak-moderate-strong-very strong", "moderate-strong-very strong", "strong-very strong", "very strong")))
results_gtex_wb$num_edges_norm = (results_gtex_wb$num_edges/results_analytical_gtex_wb$max_value_in_dataset)/results_analytical_gtex_wb$L

correlation_levels_plot_df = results_gtex_wb %>% 
  mutate(type_correlation = replace(type_correlation, type_correlation == "weak-moderate-strong-very strong", "\u2265 0.2")) %>%
  mutate(type_correlation = replace(type_correlation, type_correlation == "moderate-strong-very strong", "\u2265 0.4")) %>%
  mutate(type_correlation = replace(type_correlation, type_correlation == "strong-very strong", "\u2265 0.6")) %>%
  mutate(type_correlation = replace(type_correlation, type_correlation == "very strong", "\u2265 0.8")) %>%
  full_join((predicted_results_wb_df %>% filter(size <= max((all_results_df %>% filter((type_dataset == dataset_selected) & (threshold == threshold_selected)))$size)) %>% mutate(type_correlation="Model prediction")), by=c("type_dataset", "size", "type_correlation", "num_edges_norm"="model_result_norm")) %>%
  full_join((predictions_convergence_gtex_wb_df  %>% mutate(type_correlation="Convergence point")), by=c("size", "num_edges", "num_edges_norm", "type_correlation")) %>%
  left_join( (name2color %>% as.data.frame() %>% dplyr::rename("rgb_col"=".") %>% tibble::rownames_to_column("type_correlation")), by=c("type_correlation") )

correlation_levels_plot = correlation_levels_plot_df %>%
  ggplot(aes(x=size, y=num_edges_norm)) +
  #geom_point(data=.%>% filter(!(type_correlation %in% c("Convergence point", "Model prediction"))), aes(fill=factor(type_correlation), size=num_edges_norm), alpha=0.5, shape=21) +
  geom_point(data=.%>% filter(!(type_correlation %in% c("Convergence point", "Model prediction"))), aes(col=type_correlation, size=num_edges_norm), alpha=0.5) +
  scale_color_manual(name = "Correlation strength", values = as.vector(name2color[levels(factor((correlation_levels_plot_df %>% filter(!(type_correlation == "Model prediction")))$type_correlation))])) + # Plot using the dictionary
  new_scale_color() +
  #geom_point(data=.%>% filter(type_correlation == "Convergence point"), aes(fill=factor(type_correlation)), alpha=1, size=4, shape=21) +
  geom_point(data=.%>% filter(type_correlation == "Convergence point"), aes(col=factor(type_correlation)), alpha=1, size=3) +
  scale_color_manual(name = NULL, values = c("red")) + # Plot using the dictionary
  new_scale_color() +
  geom_line(data=.%>% filter(type_correlation == "Model prediction"), aes(col=type_correlation), linewidth=1) +
  scale_color_manual(name = NULL, values = c("red")) + # Plot using the dictionary
  #scale_x_continuous(trans = scales::log10_trans()) +
  #scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#0072B2", "#56B4E9")) +
  theme_linedraw() +
  xlab("Num. samples") +
  #xlab("log(N)") +
  ylab("Frac. significant correlations") +
  guides(col=guide_legend(override.aes = list(alpha = 1, size = 3)), col=guide_legend(title=NULL), size=guide_legend(title="Frac. sig. correlations")) +
  theme(aspect.ratio=1,
        plot.title =  element_text(size = 20, face="bold"), 
        axis.title = element_text(size = 17, face="bold"), 
        axis.text = element_text(size = 16),
        panel.grid.major=element_blank(), 
        panel.grid.minor = element_blank(),
        legend.text = element_text(size = 16), 
        legend.title=element_text(size=16, face="bold"),
        text = element_text(family = "Helvetica"))

plot_file = paste(plots_dir, "correlation_levels_pearson_pval_0.05_data_gtexwholeblood.png", sep="/")
ggsave(
  plot_file,
  plot = correlation_levels_plot,
  dpi = 1200,
  width = 9000,
  height = 6000,
  units = c("px")
)
print(correlation_levels_plot)

Standard deviation of co-expression value plot

Read data:

sd_table_file = paste(tables_dir, "analysis_sd_coexpression_weight_gtex_Whole.Blood.txt", sep="/")
sd_table_df = fread(sd_table_file)

Plot:

# Define palette using the scale_fill_brewer Oranges palette and selecting manually the colors using RColorBrewer
# https://stackoverflow.com/questions/29466239/how-to-set-the-color-range-of-scale-colour-brewer-in-ggplot2-palette-selecte 
my_orange = RColorBrewer::brewer.pal(n = (length(unique(sd_table_df$ss)) + 2), "Oranges")[3:(length(unique(sd_table_df$ss))+2)] # I get 8 and exclude the 2 first colors, which are more light 

sd_plot_file = paste(plots_dir, "analysis_sd_coexpression_weight_gtex_Whole.Blood.png", sep="/")
sd_plot = sd_table_df %>%
  mutate(ss = factor(ss, levels = as.character(sort(as.numeric(unique(sd_table_df$ss)))))) %>%
  ggplot(aes(x = ss, y = sd_correlation)) +
  #geom_half_violin(side = "r", color="orangered4", fill="orangered1") +
  geom_half_violin(aes(fill=ss), color="transparent", side = "r") +
  #scale_fill_brewer(palette="Oranges") +
  scale_fill_manual(values=my_orange) +
  theme_linedraw() +
  theme(aspect.ratio=1, 
        plot.title =  element_text(size = 20, face="bold"), 
        axis.title = element_text(size = 17, face="bold"), 
        axis.text = element_text(size = 16), 
        panel.grid.major=element_blank(), 
        panel.grid.minor = element_blank(),
        text = element_text(family = "Helvetica"),
        legend.position="none") +
  labs (x = "Num. samples",
        y = "Pearson correlation SD",
        color = "",
        fill = "")
ggsave(
  sd_plot_file,
  plot = sd_plot,
  dpi = 1200,
  #width = 9300,
  #height = 6000,
  units = c("px")
)
## Saving 8400 x 6000 px image
print(sd_plot)

rm(sd_table_df)

Patchwork:

# Use patchwork to concatenate plots
(correlation_levels_plot /
  specific_links_plot /
  sd_plot) + 
  plot_annotation(tag_levels = 'A') & # Include letters 
    theme(plot.tag = element_text(face = 'bold', size = 17))

plot_file = paste(plots_dir, "/sd_coexpression_results.png", sep="")
ggsave(
  plot_file,
  dpi = 1200,
  width = 10000,
  height = 17000,
  units = c("px")
)

Supplementary figure with the analysis of SD for different datasets:

datasets_analysis = c("tcga_TCGA-BRCA_female", "tcga_TCGA-LUAD")
datasets_analysis_titles = c("TCGA: Breast cancer", "TCGA: Lung cancer")
sd_plots = list()
for (x in 1:length(datasets_analysis)){
  dataset_selected = datasets_analysis[x]
  sd_table_file = paste(tables_dir, "/analysis_sd_coexpression_weight_", dataset_selected, ".txt", sep="")
  sd_table_df = fread(sd_table_file)
  sd_plot_file = paste(plots_dir, "/analysis_sd_coexpression_weight_", dataset_selected, ".png", sep="")
  my_orange = RColorBrewer::brewer.pal(n = (length(unique(sd_table_df$ss)) + 2), "Oranges")[3:(length(unique(sd_table_df$ss))+2)]
  sd_plot = sd_table_df %>%
    mutate(ss = factor(ss, levels = as.character(sort(as.numeric(unique(sd_table_df$ss)))))) %>%
    ggplot(aes(x = ss, y = sd_correlation)) +
    #geom_half_violin(side = "r", color="orangered4", fill="orangered1") +
    geom_half_violin(aes(fill=ss), color="transparent", side = "r") +
    #scale_fill_brewer(palette="Oranges") +
    scale_fill_manual(values=my_orange) +
    theme_linedraw() +
    theme(aspect.ratio=1, 
          plot.title =  element_text(size = 20, face="bold"), 
          axis.title = element_text(size = 17, face="bold"), 
          axis.text = element_text(size = 16), 
          panel.grid.major=element_blank(), 
          panel.grid.minor = element_blank(),
          text = element_text(family = "Helvetica"),
          legend.position="none") +
    labs (x = "Num. samples",
          y = "Pearson correlation SD",
          title = datasets_analysis_titles[[x]],
          color = "",
          fill = "")
  sd_plots[[x]] = sd_plot
  ggsave(
    sd_plot_file,
    plot = sd_plot,
    dpi = 1200,
    width = 6500,
    height = 6500,
    units = c("px")
  )
  print(sd_plot)
  rm(sd_table_df)
}

Same plots but for small sample sizes:

datasets_analysis = c("tcga_TCGA-BRCA_female", "tcga_TCGA-LUAD")
datasets_analysis_titles = c("TCGA: Breast cancer", "TCGA: Lung cancer")
sd_plots_small_sizes = list()
for (x in 1:length(datasets_analysis)){
  dataset_selected = datasets_analysis[x]
  sd_table_file = paste(tables_dir, "/analysis_sd_coexpression_weight_with_small_sizes_", dataset_selected, ".txt", sep="")
  sd_table_df = fread(sd_table_file)
  sd_plot_file = paste(plots_dir, "/analysis_sd_coexpression_weight_with_small_sizes_", dataset_selected, ".png", sep="")
  my_orange = RColorBrewer::brewer.pal(n = (length(unique(sd_table_df$ss)) + 2), "Oranges")[3:(length(unique(sd_table_df$ss))+2)]
  sd_plot_small_sizes = sd_table_df %>%
    mutate(ss = factor(ss, levels = as.character(sort(as.numeric(unique(sd_table_df$ss)))))) %>%
    ggplot(aes(x = ss, y = sd_correlation)) +
    #geom_half_violin(side = "r", color="orangered4", fill="orangered1") +
    geom_half_violin(aes(fill=ss), color="transparent", side = "r") +
    #scale_fill_brewer(palette="Oranges") +
    scale_fill_manual(values=my_orange) +
    theme_linedraw() +
    theme(aspect.ratio=1, 
          plot.title =  element_text(size = 20, face="bold"), 
          axis.title = element_text(size = 17, face="bold"), 
          axis.text = element_text(size = 16), 
          panel.grid.major=element_blank(), 
          panel.grid.minor = element_blank(),
          text = element_text(family = "Helvetica"),
          legend.position="none") +
    labs (x = "Num. samples",
          y = "Pearson correlation SD",
          title = datasets_analysis_titles[[x]],
          color = "",
          fill = "")
  sd_plots_small_sizes[[x]] = sd_plot_small_sizes
  ggsave(
    sd_plot_file,
    plot = sd_plot_small_sizes,
    dpi = 1200,
    width = 6500,
    height = 6500,
    units = c("px")
  )
  print(sd_plot_small_sizes)
  rm(sd_table_df)
}

Patchwork for the supplementary figure:

layout <- "
AB
"
patch_breast = ((sd_plots_small_sizes[[1]] + labs(tag = 'A', title = NULL)) +
  (sd_plots[[1]] + labs(tag = 'B', title = NULL))) +
  plot_annotation(title = 'TCGA: Breast cancer', theme=theme(plot.title =  element_text(size = 20, face="bold"))) &
  theme(plot.tag = element_text(face = 'bold', size = 17),
        plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))

layout <- "
CD
"
patch_lung = ((sd_plots_small_sizes[[2]] + labs(tag = 'C', title = NULL)) +
  (sd_plots[[2]] + labs(tag = 'D', title = NULL))) +
  plot_annotation(title = 'TCGA: Lung cancer', theme=theme(plot.title =  element_text(size = 20, face="bold"))) &
  theme(plot.tag = element_text(face = 'bold', size = 17),
        plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))

(wrap_elements(patch_breast) / wrap_elements(patch_lung))

plot_file = paste(plots_dir, "/analysis_sd_coexpression_weight_SI.png", sep="")
ggsave(
  plot_file,
  dpi = 1200,
  width = 12000,
  height = 14000,
  units = c("px")
)

Differential co-expression analysis plots

name_D = "tcga.brca.female"
name_N = "tcga.breast.female"
diffcoexp_plots_dir = "/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/plots/plots_differential_coexpression/TCGA-BRCA_female___TCGA-Breast_female"
diffcoexp_tables_dir = "/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/tables/tables_differential_coexpression/TCGA-BRCA_female___TCGA-Breast_female"
name2color_df = name2color %>% as.data.frame() %>% dplyr::rename("rgb_col"=".") %>% tibble::rownames_to_column("name")

Read file:

change_of_category_file = paste(diffcoexp_tables_dir, paste("codina_", name_D, "_", name_N, "_changes.txt", sep=""), sep="/")
change_of_category_df = fread(change_of_category_file)

Make plot:

sizes_list = sort(unique(change_of_category_df$size.prev))
if (max(sizes_list) <= 160) {
  sizes_to_plot = c(20, 40, 60, 80, 100, 120, 140, 160)
} else if ((max(sizes_list) > 160) & (max(sizes_list) <= 300)) {
  sizes_to_plot = c(20, 60, 100, 140, 180, 220, 260, 300)
} else if ((max(sizes_list) > 300) & (max(sizes_list) <= 440)) {
  sizes_to_plot = c(20, 80, 140, 200, 260, 320, 380, 440)
} else {
  sizes_to_plot = seq(20, max(sizes_list), 100)
}

# Create last column of plot (size 100)
change_of_category_size100_df = change_of_category_df %>%
  filter(size.curr == 100) %>%
  dplyr::select(Node, Phi_name.curr, size.curr) %>%
  dplyr::rename("Phi_name.prev"="Phi_name.curr", "size.prev"="size.curr") %>%
  mutate(Phi_name.curr = NA) %>%
  mutate(size.curr = NA) %>%
  mutate(transition_type = NA) %>%
  mutate(transition_name = NA) %>%
  mutate(transition_name_simple = Phi_name.prev) %>%
  mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple == "common", "common (constant)")) %>% 
  mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple == "disease-specific", "disease-specific (constant)")) %>% 
  mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple == "normal-specific", "normal-specific (constant)")) %>% 
  mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple == "undefined", "undefined (constant)")) %>% 
  left_join(name2color_df, by=c("Phi_name.prev"="name"))

# Count transitions / merge with colors / merge with last column
change_of_category_col_df = change_of_category_df %>%
  mutate(transition_name_simple = transition_name) %>%
  mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple == "common / common", "common (constant)")) %>% 
  mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple %in% c("common / disease-specific", "common / normal-specific", "common / undefined"), "common (change)")) %>% 
  mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple == "disease-specific / disease-specific", "disease-specific (constant)")) %>% 
  mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple %in% c("disease-specific / common", "disease-specific / normal-specific", "disease-specific / undefined"), "disease-specific (change)")) %>% 
  mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple == "normal-specific / normal-specific", "normal-specific (constant)")) %>% 
  mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple %in% c("normal-specific / common", "normal-specific / disease-specific", "normal-specific / undefined"), "normal-specific (change)")) %>% 
  mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple == "undefined / undefined", "undefined (constant)")) %>% 
  mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple %in% c("undefined / disease-specific", "undefined / normal-specific", "undefined / common"), "undefined (change)")) %>% 
  left_join(name2color_df, by=c("transition_name_simple"="name")) %>%
  full_join(change_of_category_size100_df)
## Joining, by = c("Node", "Phi_name.prev", "size.prev", "Phi_name.curr",
## "size.curr", "transition_type", "transition_name", "transition_name_simple",
## "rgb_col")
# Define factors
change_of_category_col_df$transition_name_simple <- factor(change_of_category_col_df$transition_name_simple, levels=c("common (constant)", "common (change)", "disease-specific (constant)",  "disease-specific (change)", "normal-specific (constant)", "normal-specific (change)", "undefined (constant)", "undefined (change)"))
change_of_category_col_df$size.prev = as.character(change_of_category_col_df$size.prev)
change_of_category_col_df$size.prev <- factor(change_of_category_col_df$size.prev, levels=as.character(sort(unique(as.integer(change_of_category_col_df$size.prev)))))

# Plot
plot_flow = change_of_category_col_df %>% 
  filter(size.prev %in% sizes_to_plot) %>%
  ggplot(aes(x = size.prev, stratum = transition_name_simple, alluvium = Node, 
             fill = transition_name_simple, label = transition_name_simple)) +
  geom_flow(stat = "alluvium", lode.guidance = "frontback") +
  #geom_stratum() +
  geom_stratum(aes(col=transition_name_simple), size=0) +
  scale_fill_manual(name="Gene category", values=setNames(change_of_category_col_df$rgb_col, change_of_category_col_df$transition_name_simple)) + # If I want to plot only specific elements in the legend, use the argument breaks with a list of the elements that I want to show
  scale_color_manual(name="Gene category", values=setNames(change_of_category_col_df$rgb_col, change_of_category_col_df$transition_name_simple)) +
  xlab("Num. samples") +
  ylab("Num. disease genes") +
  guides(fill=guide_legend(title="Gene category")) +
  theme_linedraw() +
  theme(aspect.ratio=1,
        plot.title =  element_text(size = 20, face="bold"), 
        axis.title = element_text(size = 17, face="bold"), 
        axis.text = element_text(size = 16),
        panel.grid.major=element_blank(), 
        panel.grid.minor = element_blank(),
        legend.text = element_text(size = 16), 
        legend.title=element_text(size=16, face="bold"),
        text = element_text(family = "Helvetica"))
plot_file = paste(diffcoexp_plots_dir, paste("codina_", name_D, "_", name_N, "_change_disease_genes_improved.png", sep=""), sep="/")
ggsave(
  plot_file,
  plot = plot_flow,
  dpi = 1200,
  width = 10000,
  height = 6000,
  units = c("px")
)
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
print(plot_flow)

Calculate stable genes by category:

# Calculate total of stable genes
total_stable_change_genes_df = change_of_category_df %>%
  group_by(size.curr) %>% 
  mutate(n_total = n()) %>%
  ungroup() %>%
  select(Node, Phi_name.curr, size.curr, transition_type, n_total) %>% 
  group_by(size.curr, transition_type, n_total) %>% 
  summarize(n_genes = n()) %>% 
  ungroup() %>%
  mutate(frac_genes = n_genes / n_total)
## `summarise()` has grouped output by 'size.curr', 'transition_type'. You can
## override using the `.groups` argument.
total_stable_change_genes_df$Phi_name.curr = "all"

# Calculate stable genes by category
categories_stable_change_genes_df = change_of_category_df %>%
  group_by(size.curr) %>% 
  mutate(n_total = n()) %>%
  ungroup() %>%
  filter(transition_type == "stable") %>%
  select(Node, Phi_name.curr, size.curr, transition_type, n_total) %>% 
  group_by(Phi_name.curr, size.curr, transition_type, n_total) %>% 
  summarize(n_genes = n()) %>% 
  ungroup() %>%
  mutate(frac_genes = n_genes / n_total)
## `summarise()` has grouped output by 'Phi_name.curr', 'size.curr',
## 'transition_type'. You can override using the `.groups` argument.
# Merge two tables
stable_genes_df = rbind(total_stable_change_genes_df, categories_stable_change_genes_df) %>%
  filter(transition_type == "stable") %>%
  left_join(name2color_df, by=c("Phi_name.curr"="name"))

Make plot:

plot_stable_genes = stable_genes_df %>%
  ggplot(aes(x = size.curr, y = frac_genes, col = Phi_name.curr)) + 
  geom_line(aes(size = Phi_name.curr)) +
  xlab("Num. samples") +
  ylab("Fraction of stable disease genes") +
  scale_size_manual(values = c("all" = 1, "common" = 0.5, "disease-specific" = 0.5, "normal-specific" = 0.5, "undefined" = 0.5, "different" = 0.5)) +
  scale_color_manual(values=setNames(stable_genes_df$rgb_col, stable_genes_df$Phi_name.curr)) +
  guides(col=guide_legend(title="Gene category"), size="none") +
  theme_linedraw() +
  theme(aspect.ratio=1,
        plot.title =  element_text(size = 20, face="bold"), 
        axis.title = element_text(size = 17, face="bold"), 
        axis.text = element_text(size = 16),
        panel.grid.major=element_blank(), 
        panel.grid.minor = element_blank(),
        legend.text = element_text(size = 16), 
        legend.title=element_text(size=16, face="bold"),
        text = element_text(family = "Helvetica"))

plot_file = paste(diffcoexp_plots_dir, paste("codina_", name_D, "_", name_N, "_stable_genes.png", sep=""), sep="/")
ggsave(
  plot_file,
  plot = plot_stable_genes,
  dpi = 1200,
  width = 9000,
  height = 6000,
  units = c("px")
)
print(plot_stable_genes)

Calculate ranking of widely known genes:

ranking_nodes_file = paste(diffcoexp_tables_dir, paste("codina_", name_D, "_", name_N, "_nodes_ranked.txt", sep=""), sep="/")
ranking_nodes_df= fread(ranking_nodes_file)
nodes_to_follow_file = "/home/j.aguirreplans/Projects/Scipher/SampleSize/data/nodes_to_follow/breast_cancer.txt"
nodes_to_follow = fread(nodes_to_follow_file, header = F)$V1

Make plot:

nodes_to_follow_ranking_plot = ranking_nodes_df %>%
  filter(Node %in% nodes_to_follow) %>%
  mutate(label=if_else(size==max(size), Node, "")) %>%
  ggplot(aes(x = size, y = rank)) + 
    geom_point(aes(group = Node, col = Phi_name)) +
    geom_line(aes(group = Node, col = Phi_name)) +
    xlab("Num. samples") +
    ylab("Ranking") +
    scale_color_manual(values=as.vector(name2color_df[name2color_df$name %in% levels(factor(unique((ranking_nodes_df %>% filter(Node %in% nodes_to_follow))$Phi_name))),]$rgb_col)) +
    scale_y_continuous(trans = "reverse") +
    guides(col=guide_legend(title="Gene category"), size="none") +
    theme_linedraw() +
    theme(aspect.ratio=1,
          plot.title =  element_text(size = 20, face="bold"), 
          axis.title = element_text(size = 17, face="bold"), 
          axis.text = element_text(size = 16),
          panel.grid.major=element_blank(), 
          panel.grid.minor = element_blank(),
          legend.text = element_text(size = 16), 
          legend.title=element_text(size=16, face="bold"),
          text = element_text(family = "Helvetica")) +
    geom_label_repel(aes(col=Phi_name, label = label),
                     fill="white",
                     box.padding = 0.2, max.overlaps = Inf,
                     label.size = 0.75, 
                     size = 5,
                     alpha = 0.9, 
                     label.padding=0.25, 
                     fontface = "bold",
                     na.rm=TRUE,
                     show.legend=F,
                     seed = 1234)

plot_file = paste(diffcoexp_plots_dir, "nodes_to_follow_ranking.png", sep="/")
ggsave(
  plot_file,
  plot=nodes_to_follow_ranking_plot,
  dpi = 1200,
  #width = 9500,
  height = 6000,
  units = c("px")
)
## Saving 8400 x 6000 px image
print(nodes_to_follow_ranking_plot)

Use patchwork to plot everything together:

(plot_flow /
  nodes_to_follow_ranking_plot /
  plot_stable_genes) + 
  plot_annotation(tag_levels = 'A') & # Include letters 
  theme(plot.tag = element_text(face = 'bold', size = 17)
        ,plot.tag.position  = c(.05, 1.04)
        ,plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm")
        ) # that are bold

plot_file = paste(diffcoexp_plots_dir, "/codina_stability_results.png", sep="")
ggsave(
  plot_file,
  dpi = 1200,
  width = 11000,
  #height = 12000,
  height = 17500,
  units = c("px")
)

Protein-protein interaction plots

Load PPI + co-expression dataframe:

dataset = "gtex"
type_dataset = "Whole.Blood"
sample_group_type = "tumor" # if dataset = "tcga"
type_counts = "reads" # tpm, reads
sizes_selected = c(20, 100, 200, 300, 400, 500)
sizes_selected_comparison_plot = c(20, 100, 200, 300)
threshold_selected = 0.05
coexpression_pairs_file = sprintf("/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/tables/ppi_analysis_pairs_pval_%s_%s_%s.txt", as.character(threshold_selected), dataset, type_dataset)
coexpression_pairs_filt = fread(coexpression_pairs_file)
perf_file = sprintf("/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/tables/ppi_coexpression_analysis_ppi_vs_random_far_pairs_performance_%s_%s.txt", dataset, type_dataset)
perf_df = fread(perf_file)

colors_defined = c("PPI" = "#DBA9CA",
                   "PPI & iNCI" = "#FAF69E",
                   "Random" = "#B8BECC",
                   "Random pairs in PPI" = "#B8BECC",
                   "Random (distance > 4)" = "#543F4D",
                   "Distance > 4" = "#543F4D",
                   "Random pairs in PPI (dist. > 4)" = "#543F4D",
                   "Random pairs in PPI & iNCI (dist. > 4)" = "#545334",
                   "1" = "#DBA9CA",
                   "2" = "#a8829b",
                   "3" = "#87677c",
                   "4" = "#5e4857",
                   "5" = "#40303b",
                   "6" = "#1c151a")

Violin plot of PPI vs. pairs > 4 distance across sample size:

violinplot_ppi_coexpression_by_types_nozero_file =  sprintf("/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/plots/ppi_coexpression_violinplot_by_pair_type_nozero_pval_%s_%s_%s.png", as.character(threshold_selected), dataset, type_dataset)
colors_defined_filt = colors_defined[names(colors_defined) %in% c("PPI", "Distance > 4")]
violinplot_ppi_coexpression_by_types_nozero = coexpression_pairs_filt %>%
  select(-distances_ppi, -distances_nci, -nci, -random_not_ppi, -random_far_not_nci) %>%
  pivot_longer(c("ppi", "random_far_not_ppi"), names_to = "category", values_to = "category_bool") %>%
  filter(category_bool == TRUE) %>%
  select(-category_bool) %>%
  pivot_longer(as.character(sizes_selected), names_to = "size", values_to = "correlation", names_transform = list(size = as.character), values_transform = list(correlation = as.numeric)) %>%
  filter(size %in% sizes_selected_comparison_plot) %>%
  filter(!(is.na(correlation))) %>%
  filter(abs(correlation) > 0 ) %>% 
  mutate(category = replace(category, category == "ppi", "PPI")) %>%
  mutate(category = replace(category, category == "random_far_not_ppi", "Distance > 4")) %>%
  mutate(category = factor(category, levels = c("PPI", "Distance > 4"))) %>%
  ggplot() +
  aes(x = reorder(size, as.numeric(size)),
      y = abs(correlation),
      fill = category,
      colour = category) +
  geom_violin(alpha = 0.8, 
              scale = "width", 
              adjust = 0.5) +
  scale_fill_manual(values = colors_defined_filt) +
  scale_color_manual(values = colors_defined_filt) +
  theme_linedraw() +
  theme(aspect.ratio=1,
        plot.title =  element_text(size = 20, face="bold"), 
        axis.title = element_text(size = 17, face="bold"), 
        axis.text = element_text(size = 16),
        panel.grid.major=element_blank(), 
        panel.grid.minor = element_blank(),
        legend.text = element_text(size = 16), 
        legend.title=element_text(size=16, face="bold"),
        text = element_text(family = "Helvetica")) +
  labs (x = "Number of samples",
        y = "Absolute co-expression weight",
        color = "",
        fill = ""); violinplot_ppi_coexpression_by_types_nozero

ggsave(
  violinplot_ppi_coexpression_by_types_nozero_file,
  plot = violinplot_ppi_coexpression_by_types_nozero,
  dpi = 1200,
  #width = 9300,
  #height = 6000,
  units = c("px")
)
## Saving 8400 x 6000 px image

Violin plot of PPI distances across sample size:

violinplot_ppi_coexpression_by_distances_nozero_file = sprintf("/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/plots/ppi_coexpression_violinplot_by_ppi_distance_nozero_pval_%s_%s_%s.png", as.character(threshold_selected), dataset, type_dataset)
colors_defined_filt = colors_defined[names(colors_defined) %in% c("1", "2", "3", "4", "5", "6")]
violinplot_ppi_coexpression_by_distances_nozero = coexpression_pairs_filt %>%
  select(-ppi, -nci, -distances_nci, -random_not_ppi, -random_far_not_ppi, -random_far_not_nci) %>%
  filter(!(is.na(distances_ppi))) %>%
  pivot_longer(as.character(sizes_selected), names_to = "size", values_to = "correlation", names_transform = list(size = as.character), values_transform = list(correlation = as.numeric)) %>%
  filter(size %in% sizes_selected_comparison_plot) %>%
  filter(!(is.na(correlation))) %>%
  filter(abs(correlation) > 0 ) %>% 
  mutate(across(c(distances_ppi), as.character)) %>%
  ggplot() +
  aes(x = reorder(size, as.numeric(size)),
      y = abs(correlation),
      fill = distances_ppi,
      colour = distances_ppi) +
  geom_violin(alpha = 0.8, 
              scale = "width", 
              adjust = 0.5) +
  scale_fill_manual(values = colors_defined_filt) +
  scale_color_manual(values = colors_defined_filt) +
  theme_linedraw() +
  theme(aspect.ratio=1,
        plot.title =  element_text(size = 20, face="bold"), 
        axis.title = element_text(size = 17, face="bold"), 
        axis.text = element_text(size = 16),
        panel.grid.major=element_blank(), 
        panel.grid.minor = element_blank(),
        legend.text = element_text(size = 16), 
        legend.title=element_text(size=16, face="bold"),
        text = element_text(family = "Helvetica")) +
  labs (x = "Number of samples",
        y = "Absolute co-expression weight",
        color = "PPI distance",
        fill = "PPI distance")
ggsave(
  violinplot_ppi_coexpression_by_distances_nozero_file,
  plot = violinplot_ppi_coexpression_by_distances_nozero,
  dpi = 1200,
  #width = 9300,
  #height = 6000,
  units = c("px")
)
## Saving 8400 x 6000 px image
violinplot_ppi_coexpression_by_distances_nozero

AUC of PPI prediction:

perf_auc = perf_df %>%
  select(size, AUC) %>%
  unique()
print(perf_auc)
##    size       AUC
## 1:   20 0.6718979
## 2:  100 0.6992475
## 3:  200 0.6962787
## 4:  300 0.7063418
## 5:  400 0.7090533
## 6:  500 0.7072046
auc_plot = perf_auc %>% 
  mutate(size = factor(size, 
                       levels = sizes_selected, 
                       labels = sizes_selected)) %>% 
  ggplot() +
  aes(x = size, 
      #fill = base, 
      #colour = base, 
      weight = AUC) +
  geom_bar(col="#DBA9CA",
         fill="#DBA9CA") +
  #scale_fill_manual(values = colors_defined) +
  #scale_color_manual(values = colors_defined) +
  coord_cartesian(ylim=c(0.5,NA)) +
  theme_linedraw() +
  #facet_grid(vars(name))+ 
  theme(aspect.ratio=1,
        plot.title =  element_text(size = 20, face="bold"), 
        axis.title = element_text(size = 17, face="bold"), 
        axis.text = element_text(size = 16), 
        panel.grid.major=element_blank(), 
        panel.grid.minor = element_blank(),
        legend.text = element_text(size = 16), 
        legend.title=element_text(size=16, face="bold"),
        text = element_text(family = "Helvetica")) +
  labs(y = "AUROC",
       x = "Number of samples"
       )

auc_plot_file =  sprintf("/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/plots/ppi_coexpression_perf_AUC_pval_%s_%s_%s.png", as.character(threshold_selected), dataset, type_dataset)
ggsave(
  auc_plot_file,
  plot = auc_plot,
  dpi = 1200,
  #width = 9300,
  height = 6000,
  units = c("px")
)
## Saving 8400 x 6000 px image
 auc_plot

Patchwork (filtered by p-value):

# Use patchwork to concatenate plots
((violinplot_ppi_coexpression_by_types_nozero + theme(legend.position="bottom")) |
  (violinplot_ppi_coexpression_by_distances_nozero + theme(legend.position="bottom")) |
  auc_plot) + 
  plot_annotation(tag_levels = 'A') &
    theme(plot.tag = element_text(face = 'bold', size = 17),
          plot.tag.position  = c(.01, 1.06),
          plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm")
          )

plot_file = paste(plots_dir, "/ppi_coexpression_results.png", sep="")
ggsave(
  plot_file,
  dpi = 1200,
  width = 20000,
  height = 9000,
  units = c("px")
)